{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 5000, "global_step": 5421, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 614.5, "completions/mean_terminated_length": 614.5, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.00018446781036709093, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 9300.0, "reward": 0.9318181872367859, "reward_std": 0.7186995148658752, "rewards/fixed_code_pass_all_test_reward/mean": 0.3068181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.2993006408214569, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 400.625, "completions/mean_terminated_length": 400.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.00036893562073418186, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.0, "learning_rate": 3.683241252302026e-08, "loss": 0.0, "num_tokens": 17809.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 418.625, "completions/mean_terminated_length": 418.625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.0005534034311012728, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0002503307196093374, "learning_rate": 7.366482504604052e-08, "loss": 0.0, "num_tokens": 27390.0, "reward": 1.0208332538604736, "reward_std": 0.682883083820343, "rewards/fixed_code_pass_all_test_reward/mean": 0.3958333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.4494926333427429, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.0007378712414683637, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.00014034915056981845, "learning_rate": 1.1049723756906078e-07, "loss": 0.0, "num_tokens": 36391.0, "reward": 0.875, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 375.5, "completions/mean_terminated_length": 375.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.0009223390518354548, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.00021548542099480983, "learning_rate": 1.4732965009208104e-07, "loss": 0.0, "num_tokens": 43643.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 446.125, "completions/mean_terminated_length": 446.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.0011068068622025456, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.00022794705000706017, "learning_rate": 1.8416206261510132e-07, "loss": 0.0, "num_tokens": 53764.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 475.375, "completions/mean_terminated_length": 475.375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.0012912746725696365, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.0003054401186091127, "learning_rate": 2.2099447513812156e-07, "loss": 0.0, "num_tokens": 62991.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 312.375, "completions/mean_terminated_length": 312.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.0014757424829367274, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.00021718430161854485, "learning_rate": 2.5782688766114184e-07, "loss": 0.0, "num_tokens": 68610.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 575.625, "completions/mean_terminated_length": 575.625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.0016602102933038186, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.00020346231349321897, "learning_rate": 2.946593001841621e-07, "loss": 0.0, "num_tokens": 80327.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 352.125, "completions/mean_terminated_length": 352.125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0018446781036709095, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.00023884191250544973, "learning_rate": 3.3149171270718233e-07, "loss": 0.0, "num_tokens": 87792.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 389.5, "completions/mean_terminated_length": 389.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.0020291459140380002, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.00023631559815839864, "learning_rate": 3.6832412523020263e-07, "loss": 0.0, "num_tokens": 93676.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 590.25, "completions/mean_terminated_length": 590.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.002213613724405091, "frac_reward_zero_std": 0.0, "grad_norm": 8.625, "kl": 0.00026661740594136063, "learning_rate": 4.051565377532229e-07, "loss": 0.0, "num_tokens": 108134.0, "reward": 0.625, "reward_std": 0.6232117414474487, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 255.125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.002398081534772182, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.00018176211960962974, "learning_rate": 4.419889502762431e-07, "loss": 0.0, "num_tokens": 114151.0, "reward": 0.5769230723381042, "reward_std": 0.5667603015899658, "rewards/fixed_code_pass_all_test_reward/mean": 0.32692307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.2024783194065094, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 355.25, "completions/mean_terminated_length": 355.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.002582549345139273, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.00019514420409905142, "learning_rate": 4.788213627992634e-07, "loss": 0.0, "num_tokens": 122033.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 547.875, "completions/mean_terminated_length": 333.5714416503906, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.002767017155506364, "frac_reward_zero_std": 0.0, "grad_norm": 6.78125, "kl": 0.00022281866040430032, "learning_rate": 5.156537753222837e-07, "loss": 0.0, "num_tokens": 132256.0, "reward": 1.3568181991577148, "reward_std": 0.7189089059829712, "rewards/fixed_code_pass_all_test_reward/mean": 0.7318181991577148, "rewards/fixed_code_pass_all_test_reward/std": 0.32375454902648926, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 354.5, "completions/mean_terminated_length": 354.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.002951484965873455, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.0001778191526682349, "learning_rate": 5.524861878453039e-07, "loss": 0.0, "num_tokens": 138028.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 428.25, "completions/mean_terminated_length": 428.25, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.003135952776240546, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.00014581392497348133, "learning_rate": 5.893186003683242e-07, "loss": 0.0, "num_tokens": 146558.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 418.875, "completions/mean_terminated_length": 418.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.003320420586607637, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.0002042061896645464, "learning_rate": 6.261510128913445e-07, "loss": 0.0, "num_tokens": 154101.0, "reward": 0.5625, "reward_std": 0.5315677523612976, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.07363035529851913, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 291.5, "completions/mean_terminated_length": 291.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.003504888396974728, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0002542335087127867, "learning_rate": 6.629834254143647e-07, "loss": 0.0, "num_tokens": 159377.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 649.125, "completions/mean_terminated_length": 649.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.003689356207341819, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.00016768612795203808, "learning_rate": 6.99815837937385e-07, "loss": 0.0, "num_tokens": 175170.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 457.25, "completions/mean_terminated_length": 230.00001525878906, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.00387382401770891, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.00029403330154309515, "learning_rate": 7.366482504604053e-07, "loss": 0.0, "num_tokens": 181780.0, "reward": 0.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 442.375, "completions/mean_terminated_length": 442.375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.0040582918280760005, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.00017682783800410107, "learning_rate": 7.734806629834254e-07, "loss": 0.0, "num_tokens": 194695.0, "reward": 0.6481480598449707, "reward_std": 0.5267673134803772, "rewards/fixed_code_pass_all_test_reward/mean": 0.023148149251937866, "rewards/fixed_code_pass_all_test_reward/std": 0.019168488681316376, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 532.125, "completions/mean_terminated_length": 532.125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.004242759638443091, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.0001325656339759007, "learning_rate": 8.103130755064458e-07, "loss": 0.0, "num_tokens": 204296.0, "reward": 0.5107142925262451, "reward_std": 0.538367509841919, "rewards/fixed_code_pass_all_test_reward/mean": 0.010714286006987095, "rewards/fixed_code_pass_all_test_reward/std": 0.006613001227378845, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 572.875, "completions/mean_terminated_length": 572.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.004427227448810182, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.0003224414467695169, "learning_rate": 8.471454880294661e-07, "loss": 0.0, "num_tokens": 219967.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 520.875, "completions/mean_terminated_length": 520.875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.004611695259177273, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.00021408604879979976, "learning_rate": 8.839779005524863e-07, "loss": 0.0, "num_tokens": 234070.0, "reward": 0.8154761791229248, "reward_std": 0.5933643579483032, "rewards/fixed_code_pass_all_test_reward/mean": 0.1904762089252472, "rewards/fixed_code_pass_all_test_reward/std": 0.26574185490608215, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.004796163069544364, "frac_reward_zero_std": 0.0, "grad_norm": 3.296875, "kl": 0.00026074393645103555, "learning_rate": 9.208103130755065e-07, "loss": 0.0, "num_tokens": 241156.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 433.0, "completions/mean_terminated_length": 202.2857208251953, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.004980630879911455, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.00021680369809473632, "learning_rate": 9.576427255985269e-07, "loss": 0.0, "num_tokens": 247476.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 439.125, "completions/mean_terminated_length": 439.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.005165098690278546, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.0002649242615007097, "learning_rate": 9.94475138121547e-07, "loss": 0.0, "num_tokens": 257165.0, "reward": 0.9047619104385376, "reward_std": 0.7563573122024536, "rewards/fixed_code_pass_all_test_reward/mean": 0.5297619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.4162534475326538, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 327.5, "completions/mean_terminated_length": 327.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.005349566500645637, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.00020057262190675829, "learning_rate": 1.0313075506445673e-06, "loss": 0.0, "num_tokens": 266793.0, "reward": 1.3689024448394775, "reward_std": 0.6324605345726013, "rewards/fixed_code_pass_all_test_reward/mean": 0.49390244483947754, "rewards/fixed_code_pass_all_test_reward/std": 0.36591172218322754, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 369.875, "completions/mean_terminated_length": 369.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.005534034311012728, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.0002542876254665316, "learning_rate": 1.0681399631675876e-06, "loss": 0.0, "num_tokens": 274712.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.005718502121379819, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.00024302543852172676, "learning_rate": 1.1049723756906078e-06, "loss": 0.0, "num_tokens": 283811.0, "reward": 1.01953125, "reward_std": 0.4664938151836395, "rewards/fixed_code_pass_all_test_reward/mean": 0.39453125, "rewards/fixed_code_pass_all_test_reward/std": 0.32040777802467346, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 350.625, "completions/mean_terminated_length": 350.625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.00590296993174691, "frac_reward_zero_std": 0.0, "grad_norm": 4.59375, "kl": 0.00036257309147913475, "learning_rate": 1.141804788213628e-06, "loss": 0.0, "num_tokens": 290568.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 456.125, "completions/mean_terminated_length": 456.125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.006087437742114001, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.00027114486510981806, "learning_rate": 1.1786372007366483e-06, "loss": 0.0, "num_tokens": 301049.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 316.75, "completions/mean_terminated_length": 316.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.006271905552481092, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.00020182474690955132, "learning_rate": 1.2154696132596686e-06, "loss": 0.0, "num_tokens": 306751.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 460.0, "completions/mean_terminated_length": 460.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.006456373362848183, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.00013817371177538007, "learning_rate": 1.252302025782689e-06, "loss": 0.0, "num_tokens": 314647.0, "reward": 0.6071428656578064, "reward_std": 0.5050762891769409, "rewards/fixed_code_pass_all_test_reward/mean": 0.6071428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.5050762891769409, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 470.75, "completions/mean_terminated_length": 470.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.006640841173215274, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.0002697959334909683, "learning_rate": 1.289134438305709e-06, "loss": 0.0, "num_tokens": 326653.0, "reward": 0.675000011920929, "reward_std": 0.5750776529312134, "rewards/fixed_code_pass_all_test_reward/mean": 0.05000000074505806, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 541.25, "completions/mean_terminated_length": 541.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.006825308983582365, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.0002184828645113157, "learning_rate": 1.3259668508287293e-06, "loss": 0.0, "num_tokens": 338343.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 424.125, "completions/mean_terminated_length": 424.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.007009776793949456, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.0002829425875461311, "learning_rate": 1.3627992633517498e-06, "loss": 0.0, "num_tokens": 346176.0, "reward": 0.34183675050735474, "reward_std": 0.6330526471138, "rewards/fixed_code_pass_all_test_reward/mean": 0.09183673560619354, "rewards/fixed_code_pass_all_test_reward/std": 0.17039811611175537, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 510.375, "completions/mean_terminated_length": 510.375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.007194244604316547, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.00022559235549124423, "learning_rate": 1.39963167587477e-06, "loss": 0.0, "num_tokens": 355571.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 419.0, "completions/mean_terminated_length": 419.0, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.007378712414683638, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0003436515344219515, "learning_rate": 1.43646408839779e-06, "loss": 0.0, "num_tokens": 363419.0, "reward": 0.375, "reward_std": 0.7126409411430359, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2815771996974945, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 311.125, "completions/mean_terminated_length": 311.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.007563180225050729, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.0001629171574677457, "learning_rate": 1.4732965009208105e-06, "loss": 0.0, "num_tokens": 370596.0, "reward": 0.7265625, "reward_std": 0.5569543242454529, "rewards/fixed_code_pass_all_test_reward/mean": 0.1015625, "rewards/fixed_code_pass_all_test_reward/std": 0.0941212922334671, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 469.125, "completions/mean_terminated_length": 469.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.00774764803541782, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.0002483787520759506, "learning_rate": 1.5101289134438308e-06, "loss": 0.0, "num_tokens": 380213.0, "reward": 0.4353448450565338, "reward_std": 0.5737409591674805, "rewards/fixed_code_pass_all_test_reward/mean": 0.06034483015537262, "rewards/fixed_code_pass_all_test_reward/std": 0.06834698468446732, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 364.375, "completions/mean_terminated_length": 364.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.00793211584578491, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.0002931401595560601, "learning_rate": 1.5469613259668508e-06, "loss": 0.0, "num_tokens": 389792.0, "reward": 0.875, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 341.25, "completions/mean_terminated_length": 341.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.008116583656152001, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.00021199880939093418, "learning_rate": 1.5837937384898713e-06, "loss": 0.0, "num_tokens": 395586.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 313.0, "completions/mean_terminated_length": 313.0, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.008301051466519093, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.0002819107576215174, "learning_rate": 1.6206261510128915e-06, "loss": 0.0, "num_tokens": 402978.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 345.875, "completions/mean_terminated_length": 345.875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.008485519276886183, "frac_reward_zero_std": 0.0, "grad_norm": 4.28125, "kl": 0.0002345051325391978, "learning_rate": 1.6574585635359118e-06, "loss": 0.0, "num_tokens": 409681.0, "reward": 0.734375, "reward_std": 0.7453351616859436, "rewards/fixed_code_pass_all_test_reward/mean": 0.359375, "rewards/fixed_code_pass_all_test_reward/std": 0.40881744027137756, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 491.0, "completions/mean_terminated_length": 491.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.008669987087253275, "frac_reward_zero_std": 0.0, "grad_norm": 3.28125, "kl": 0.0002542476486269152, "learning_rate": 1.6942909760589322e-06, "loss": 0.0, "num_tokens": 419497.0, "reward": 0.7716836929321289, "reward_std": 0.7586972117424011, "rewards/fixed_code_pass_all_test_reward/mean": 0.2716836631298065, "rewards/fixed_code_pass_all_test_reward/std": 0.3891312777996063, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 463.25, "completions/mean_terminated_length": 463.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.008854454897620365, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.0002121884244843386, "learning_rate": 1.7311233885819523e-06, "loss": 0.0, "num_tokens": 431763.0, "reward": 0.5949074029922485, "reward_std": 0.5379428267478943, "rewards/fixed_code_pass_all_test_reward/mean": 0.09490740299224854, "rewards/fixed_code_pass_all_test_reward/std": 0.16576401889324188, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 374.25, "completions/mean_terminated_length": 374.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.009038922707987456, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0002037937911154586, "learning_rate": 1.7679558011049725e-06, "loss": 0.0, "num_tokens": 438189.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 452.25, "completions/mean_terminated_length": 452.25, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.009223390518354546, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.00021095320789754624, "learning_rate": 1.804788213627993e-06, "loss": 0.0, "num_tokens": 446815.0, "reward": 0.6666666865348816, "reward_std": 0.6172134280204773, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.30860671401023865, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 418.5, "completions/mean_terminated_length": 418.5, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.009407858328721638, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.000279675593446882, "learning_rate": 1.841620626151013e-06, "loss": 0.0, "num_tokens": 455531.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.009592326139088728, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.00024085327095235698, "learning_rate": 1.8784530386740332e-06, "loss": 0.0, "num_tokens": 463372.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 300.375, "completions/mean_terminated_length": 300.375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.00977679394945582, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.0004095350468560355, "learning_rate": 1.9152854511970537e-06, "loss": 0.0, "num_tokens": 468639.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 463.75, "completions/mean_terminated_length": 463.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.00996126175982291, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.00023389830903397524, "learning_rate": 1.9521178637200737e-06, "loss": 0.0, "num_tokens": 478837.0, "reward": 0.3863636255264282, "reward_std": 0.5339699983596802, "rewards/fixed_code_pass_all_test_reward/mean": 0.011363636702299118, "rewards/fixed_code_pass_all_test_reward/std": 0.032141219824552536, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 524.75, "completions/mean_terminated_length": 524.75, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.010145729570190002, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.0003300023581687128, "learning_rate": 1.988950276243094e-06, "loss": 0.0, "num_tokens": 492363.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 362.25, "completions/mean_terminated_length": 362.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.010330197380557092, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.000311942389998876, "learning_rate": 2.0257826887661147e-06, "loss": 0.0, "num_tokens": 502085.0, "reward": 1.2145522832870483, "reward_std": 0.6858223080635071, "rewards/fixed_code_pass_all_test_reward/mean": 0.46455222368240356, "rewards/fixed_code_pass_all_test_reward/std": 0.4221862852573395, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 391.875, "completions/mean_terminated_length": 391.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.010514665190924184, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.00028297298376855906, "learning_rate": 2.0626151012891347e-06, "loss": 0.0, "num_tokens": 512604.0, "reward": 1.0223214626312256, "reward_std": 0.6906649470329285, "rewards/fixed_code_pass_all_test_reward/mean": 0.2723214030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.3272920548915863, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 366.125, "completions/mean_terminated_length": 366.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.010699133001291274, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.00022805896696809214, "learning_rate": 2.0994475138121547e-06, "loss": 0.0, "num_tokens": 518365.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 514.0, "completions/mean_terminated_length": 294.8571472167969, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.010883600811658366, "frac_reward_zero_std": 0.0, "grad_norm": 3.078125, "kl": 0.0002971382036776049, "learning_rate": 2.136279926335175e-06, "loss": 0.0, "num_tokens": 525317.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 335.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.011068068622025456, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.000340497397701256, "learning_rate": 2.1731123388581952e-06, "loss": 0.0, "num_tokens": 531014.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 420.25, "completions/mean_terminated_length": 420.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.011252536432392548, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.0003046246165467892, "learning_rate": 2.2099447513812157e-06, "loss": 0.0, "num_tokens": 538912.0, "reward": 0.6458333134651184, "reward_std": 0.5377606153488159, "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 264.5, "completions/mean_terminated_length": 264.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.011437004242759638, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.00035895742621505633, "learning_rate": 2.246777163904236e-06, "loss": 0.0, "num_tokens": 543988.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 801.5, "completions/mean_terminated_length": 623.4285888671875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.01162147205312673, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.00029784061462123645, "learning_rate": 2.283609576427256e-06, "loss": 0.0, "num_tokens": 553576.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 566.625, "completions/mean_terminated_length": 566.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.01180593986349382, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.00033613196319493, "learning_rate": 2.320441988950276e-06, "loss": 0.0, "num_tokens": 565733.0, "reward": 0.6428571343421936, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 335.875, "completions/mean_terminated_length": 335.875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.011990407673860911, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "kl": 0.00035641383055917686, "learning_rate": 2.3572744014732967e-06, "loss": 0.0, "num_tokens": 573364.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 250.5, "completions/mean_terminated_length": 250.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.012174875484228001, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.0003659196681837784, "learning_rate": 2.394106813996317e-06, "loss": 0.0, "num_tokens": 578192.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 373.625, "completions/mean_terminated_length": 373.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.012359343294595093, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.00021983245142109809, "learning_rate": 2.430939226519337e-06, "loss": 0.0, "num_tokens": 590453.0, "reward": 0.7250000238418579, "reward_std": 0.5548487305641174, "rewards/fixed_code_pass_all_test_reward/mean": 0.10000000149011612, "rewards/fixed_code_pass_all_test_reward/std": 0.10690450668334961, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 478.375, "completions/mean_terminated_length": 478.375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.012543811104962183, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0003333043514430756, "learning_rate": 2.4677716390423576e-06, "loss": 0.0, "num_tokens": 599192.0, "reward": 0.5416666865348816, "reward_std": 0.46929532289505005, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.30860671401023865, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 528.875, "completions/mean_terminated_length": 528.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.012728278915329275, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0003280547934991773, "learning_rate": 2.504604051565378e-06, "loss": 0.0, "num_tokens": 612423.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 587.875, "completions/mean_terminated_length": 587.875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.012912746725696367, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.0003079277175856987, "learning_rate": 2.541436464088398e-06, "loss": 0.0, "num_tokens": 624206.0, "reward": 0.8888888955116272, "reward_std": 0.32563868165016174, "rewards/fixed_code_pass_all_test_reward/mean": 0.013888888992369175, "rewards/fixed_code_pass_all_test_reward/std": 0.029695695266127586, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.013097214536063457, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.00020002496103188605, "learning_rate": 2.578268876611418e-06, "loss": 0.0, "num_tokens": 630297.0, "reward": 1.2678570747375488, "reward_std": 0.7988238334655762, "rewards/fixed_code_pass_all_test_reward/mean": 0.5178571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.3576526939868927, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.013281682346430549, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.00044113619878771715, "learning_rate": 2.6151012891344386e-06, "loss": 0.0, "num_tokens": 636209.0, "reward": 0.8916666507720947, "reward_std": 0.6057265996932983, "rewards/fixed_code_pass_all_test_reward/mean": 0.2666666805744171, "rewards/fixed_code_pass_all_test_reward/std": 0.24688535928726196, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.013466150156797639, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.0003353912106831558, "learning_rate": 2.6519337016574586e-06, "loss": 0.0, "num_tokens": 643754.0, "reward": 1.3232758045196533, "reward_std": 0.43925654888153076, "rewards/fixed_code_pass_all_test_reward/mean": 0.3232758641242981, "rewards/fixed_code_pass_all_test_reward/std": 0.43925657868385315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.01365061796716473, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.0008507270867994521, "learning_rate": 2.6887661141804787e-06, "loss": 0.0, "num_tokens": 648793.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 462.625, "completions/mean_terminated_length": 462.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.01383508577753182, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.0005278029329929268, "learning_rate": 2.7255985267034996e-06, "loss": 0.0, "num_tokens": 655454.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 624.75, "completions/mean_terminated_length": 624.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.014019553587898912, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.00041691823389555793, "learning_rate": 2.7624309392265196e-06, "loss": 0.0, "num_tokens": 669380.0, "reward": 0.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 615.25, "completions/mean_terminated_length": 615.25, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.014204021398266002, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.0005165893489902373, "learning_rate": 2.79926335174954e-06, "loss": 0.0, "num_tokens": 681510.0, "reward": 0.6370192170143127, "reward_std": 0.5285026431083679, "rewards/fixed_code_pass_all_test_reward/mean": 0.012019230984151363, "rewards/fixed_code_pass_all_test_reward/std": 0.03399552032351494, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 406.125, "completions/mean_terminated_length": 406.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.014388489208633094, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.0002721593864407623, "learning_rate": 2.83609576427256e-06, "loss": 0.0, "num_tokens": 693167.0, "reward": 1.3194444179534912, "reward_std": 0.3914315700531006, "rewards/fixed_code_pass_all_test_reward/mean": 0.3194444477558136, "rewards/fixed_code_pass_all_test_reward/std": 0.391431599855423, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 411.5, "completions/mean_terminated_length": 411.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.014572957019000184, "frac_reward_zero_std": 0.0, "grad_norm": 4.09375, "kl": 0.0006489746410807129, "learning_rate": 2.87292817679558e-06, "loss": 0.0, "num_tokens": 703187.0, "reward": 0.41826921701431274, "reward_std": 0.5170129537582397, "rewards/fixed_code_pass_all_test_reward/mean": 0.04326923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.05971721187233925, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 321.0, "completions/mean_terminated_length": 321.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.014757424829367276, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.0009333192720077932, "learning_rate": 2.9097605893186006e-06, "loss": 0.0, "num_tokens": 709923.0, "reward": 0.2724999785423279, "reward_std": 0.4686683118343353, "rewards/fixed_code_pass_all_test_reward/mean": 0.022499999031424522, "rewards/fixed_code_pass_all_test_reward/std": 0.03284161165356636, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 429.25, "completions/mean_terminated_length": 429.25, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.014941892639734366, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.0003500412858556956, "learning_rate": 2.946593001841621e-06, "loss": 0.0, "num_tokens": 718509.0, "reward": 0.26838237047195435, "reward_std": 0.47980406880378723, "rewards/fixed_code_pass_all_test_reward/mean": 0.018382353708148003, "rewards/fixed_code_pass_all_test_reward/std": 0.03493338078260422, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 668.5, "completions/mean_terminated_length": 668.5, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.015126360450101458, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.0002641449391376227, "learning_rate": 2.983425414364641e-06, "loss": 0.0, "num_tokens": 729921.0, "reward": 0.6785714626312256, "reward_std": 0.5164918303489685, "rewards/fixed_code_pass_all_test_reward/mean": 0.1785714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.1478712111711502, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 196.75, "completions/mean_terminated_length": 196.75, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.015310828260468548, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "kl": 0.001355928343400592, "learning_rate": 3.0202578268876615e-06, "loss": 0.0001, "num_tokens": 739367.0, "reward": 0.6363636255264282, "reward_std": 0.5271084904670715, "rewards/fixed_code_pass_all_test_reward/mean": 0.011363636702299118, "rewards/fixed_code_pass_all_test_reward/std": 0.015683310106396675, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 449.375, "completions/mean_terminated_length": 449.375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.01549529607083564, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.0005008893567719497, "learning_rate": 3.0570902394106816e-06, "loss": 0.0, "num_tokens": 750402.0, "reward": 0.9090909361839294, "reward_std": 0.5871508717536926, "rewards/fixed_code_pass_all_test_reward/mean": 0.15909092128276825, "rewards/fixed_code_pass_all_test_reward/std": 0.19887524843215942, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.01567976388120273, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.000426482038164977, "learning_rate": 3.0939226519337016e-06, "loss": 0.0, "num_tokens": 761986.0, "reward": 0.5763888955116272, "reward_std": 0.5832126140594482, "rewards/fixed_code_pass_all_test_reward/mean": 0.0763888880610466, "rewards/fixed_code_pass_all_test_reward/std": 0.141531303524971, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.01586423169156982, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.00038498958565469366, "learning_rate": 3.130755064456722e-06, "loss": 0.0, "num_tokens": 767930.0, "reward": 1.125, "reward_std": 0.4432026147842407, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.016048699501936912, "frac_reward_zero_std": 0.0, "grad_norm": 4.6875, "kl": 0.001511697395471856, "learning_rate": 3.1675874769797425e-06, "loss": 0.0001, "num_tokens": 771944.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 521.625, "completions/mean_terminated_length": 521.625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.016233167312304002, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.0003269107037340291, "learning_rate": 3.204419889502763e-06, "loss": 0.0, "num_tokens": 779821.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 315.625, "completions/mean_terminated_length": 315.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.016417635122671095, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.0008808420752757229, "learning_rate": 3.241252302025783e-06, "loss": 0.0, "num_tokens": 788042.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 477.75, "completions/mean_terminated_length": 477.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.016602102933038185, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.0004472737291507656, "learning_rate": 3.278084714548803e-06, "loss": 0.0, "num_tokens": 800744.0, "reward": 1.098684310913086, "reward_std": 0.7465459704399109, "rewards/fixed_code_pass_all_test_reward/mean": 0.3486842215061188, "rewards/fixed_code_pass_all_test_reward/std": 0.37920576333999634, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.016786570743405275, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.0005590501205006149, "learning_rate": 3.3149171270718235e-06, "loss": 0.0, "num_tokens": 810043.0, "reward": 0.8020833730697632, "reward_std": 0.6407732367515564, "rewards/fixed_code_pass_all_test_reward/mean": 0.1770833432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.33165502548217773, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 699.75, "completions/mean_terminated_length": 699.75, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.016971038553772366, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0005593701534962747, "learning_rate": 3.3517495395948436e-06, "loss": 0.0, "num_tokens": 824625.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 377.125, "completions/mean_terminated_length": 377.125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.01715550636413946, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.0022984608731349, "learning_rate": 3.3885819521178644e-06, "loss": 0.0001, "num_tokens": 830578.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.01733997417450655, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.0007947759222588502, "learning_rate": 3.4254143646408845e-06, "loss": 0.0, "num_tokens": 837466.0, "reward": 0.75, "reward_std": 0.5293578505516052, "rewards/fixed_code_pass_all_test_reward/mean": 0.3750000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.264678955078125, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 424.625, "completions/mean_terminated_length": 424.625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.01752444198487364, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0005154258142283652, "learning_rate": 3.4622467771639045e-06, "loss": 0.0, "num_tokens": 845935.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 357.625, "completions/mean_terminated_length": 357.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.01770890979524073, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.00047945534242899157, "learning_rate": 3.4990791896869245e-06, "loss": 0.0, "num_tokens": 855988.0, "reward": 0.8125, "reward_std": 0.6057757139205933, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.15526476502418518, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 435.5, "completions/mean_terminated_length": 435.5, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.017893377605607823, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0006367011483234819, "learning_rate": 3.535911602209945e-06, "loss": 0.0, "num_tokens": 867656.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.018077845415974913, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.0011245065688854083, "learning_rate": 3.572744014732965e-06, "loss": 0.0, "num_tokens": 872175.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 558.125, "completions/mean_terminated_length": 345.2857360839844, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.018262313226342003, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.00043920454481849447, "learning_rate": 3.609576427255986e-06, "loss": 0.0, "num_tokens": 881640.0, "reward": 0.6129031777381897, "reward_std": 0.4931521415710449, "rewards/fixed_code_pass_all_test_reward/mean": 0.11290322244167328, "rewards/fixed_code_pass_all_test_reward/std": 0.22282427549362183, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.018446781036709093, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.0017676833376754075, "learning_rate": 3.646408839779006e-06, "loss": 0.0001, "num_tokens": 886848.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.018631248847076187, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.000810316952993162, "learning_rate": 3.683241252302026e-06, "loss": 0.0, "num_tokens": 894113.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 437.25, "completions/mean_terminated_length": 437.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.018815716657443277, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0009324597122031264, "learning_rate": 3.7200736648250464e-06, "loss": 0.0, "num_tokens": 904291.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 442.625, "completions/mean_terminated_length": 442.625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.019000184467810367, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.0005093587351439055, "learning_rate": 3.7569060773480665e-06, "loss": 0.0, "num_tokens": 912336.0, "reward": 0.6041666865348816, "reward_std": 0.45151862502098083, "rewards/fixed_code_pass_all_test_reward/mean": 0.1041666641831398, "rewards/fixed_code_pass_all_test_reward/std": 0.19287919998168945, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 525.5, "completions/mean_terminated_length": 308.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.019184652278177457, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.000909166990822996, "learning_rate": 3.7937384898710865e-06, "loss": 0.0, "num_tokens": 919388.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 338.5, "completions/mean_terminated_length": 338.5, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.01936912008854455, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.0016367755342798773, "learning_rate": 3.830570902394107e-06, "loss": 0.0001, "num_tokens": 925776.0, "reward": 0.7708333134651184, "reward_std": 0.7011187076568604, "rewards/fixed_code_pass_all_test_reward/mean": 0.2708333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.2946278154850006, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 772.0, "completions/mean_terminated_length": 589.7142944335938, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.01955358789891164, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.0004678983023040928, "learning_rate": 3.867403314917128e-06, "loss": 0.0, "num_tokens": 935184.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 357.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.01973805570927873, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0006873999882373028, "learning_rate": 3.9042357274401475e-06, "loss": 0.0, "num_tokens": 945412.0, "reward": 1.1477272510528564, "reward_std": 0.5277147889137268, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.2748832702636719, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 375.5, "completions/mean_terminated_length": 375.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.01992252351964582, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.0007842397353670094, "learning_rate": 3.941068139963168e-06, "loss": 0.0, "num_tokens": 954864.0, "reward": 1.34375, "reward_std": 0.5990133285522461, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.35591307282447815, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 529.375, "completions/mean_terminated_length": 529.375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.020106991330012914, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.0006852532897028141, "learning_rate": 3.977900552486188e-06, "loss": 0.0, "num_tokens": 966571.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 452.625, "completions/mean_terminated_length": 452.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.020291459140380004, "frac_reward_zero_std": 0.0, "grad_norm": 6.53125, "kl": 0.005567755958509224, "learning_rate": 4.014732965009208e-06, "loss": 0.0002, "num_tokens": 974480.0, "reward": 1.0892857313156128, "reward_std": 0.8257429003715515, "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.4143033027648926, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 338.625, "completions/mean_terminated_length": 338.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.020475926950747094, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0007093991662259214, "learning_rate": 4.051565377532229e-06, "loss": 0.0, "num_tokens": 983453.0, "reward": 0.762499988079071, "reward_std": 0.7130166888237, "rewards/fixed_code_pass_all_test_reward/mean": 0.13750000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.3502550423145294, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 352.75, "completions/mean_terminated_length": 352.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.020660394761114184, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.0007009400396782439, "learning_rate": 4.088397790055249e-06, "loss": 0.0, "num_tokens": 992843.0, "reward": 1.086538553237915, "reward_std": 0.346916139125824, "rewards/fixed_code_pass_all_test_reward/mean": 0.21153846383094788, "rewards/fixed_code_pass_all_test_reward/std": 0.10878566652536392, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 485.625, "completions/mean_terminated_length": 485.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.020844862571481278, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.0004369776306702988, "learning_rate": 4.125230202578269e-06, "loss": 0.0, "num_tokens": 1007120.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 319.5, "completions/mean_terminated_length": 319.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.021029330381848368, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.0017321194463875145, "learning_rate": 4.162062615101289e-06, "loss": 0.0001, "num_tokens": 1014188.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 431.125, "completions/mean_terminated_length": 431.125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.021213798192215458, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.0014266927319113165, "learning_rate": 4.1988950276243095e-06, "loss": 0.0001, "num_tokens": 1024397.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 351.5, "completions/mean_terminated_length": 351.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.021398266002582548, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0006058404942450579, "learning_rate": 4.23572744014733e-06, "loss": 0.0, "num_tokens": 1034897.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.02158273381294964, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0006622296605200972, "learning_rate": 4.27255985267035e-06, "loss": 0.0, "num_tokens": 1041123.0, "reward": 1.4166667461395264, "reward_std": 0.7715167999267578, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.4364357888698578, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 413.625, "completions/mean_terminated_length": 413.625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.02176720162331673, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.0009666922633186914, "learning_rate": 4.309392265193371e-06, "loss": 0.0, "num_tokens": 1049552.0, "reward": 0.7375000715255737, "reward_std": 0.5629958510398865, "rewards/fixed_code_pass_all_test_reward/mean": 0.11250000447034836, "rewards/fixed_code_pass_all_test_reward/std": 0.09910312294960022, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.02195166943368382, "frac_reward_zero_std": 0.0, "grad_norm": 12.0, "kl": 0.03941391921398463, "learning_rate": 4.3462246777163904e-06, "loss": 0.0016, "num_tokens": 1054095.0, "reward": 1.125, "reward_std": 0.9910312294960022, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 604.625, "completions/mean_terminated_length": 604.625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.02213613724405091, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.0007394884814857505, "learning_rate": 4.383057090239411e-06, "loss": 0.0, "num_tokens": 1064724.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 303.0, "completions/mean_terminated_length": 303.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.022320605054418005, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0008006801217561588, "learning_rate": 4.419889502762431e-06, "loss": 0.0, "num_tokens": 1071404.0, "reward": 0.7555555105209351, "reward_std": 0.5289502143859863, "rewards/fixed_code_pass_all_test_reward/mean": 0.13055555522441864, "rewards/fixed_code_pass_all_test_reward/std": 0.24239765107631683, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 375.25, "completions/mean_terminated_length": 375.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.022505072864785095, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0008528216421836987, "learning_rate": 4.456721915285452e-06, "loss": 0.0, "num_tokens": 1081158.0, "reward": 0.7980769276618958, "reward_std": 0.4605070948600769, "rewards/fixed_code_pass_all_test_reward/mean": 0.048076923936605453, "rewards/fixed_code_pass_all_test_reward/std": 0.057232603430747986, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 541.75, "completions/mean_terminated_length": 541.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.022689540675152185, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.0009618195545044728, "learning_rate": 4.493554327808472e-06, "loss": 0.0, "num_tokens": 1093748.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 449.625, "completions/mean_terminated_length": 449.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.022874008485519275, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.000639700549072586, "learning_rate": 4.530386740331492e-06, "loss": 0.0, "num_tokens": 1105313.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 226.0, "completions/mean_terminated_length": 226.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.02305847629588637, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125, "kl": 0.006158997814054601, "learning_rate": 4.567219152854512e-06, "loss": 0.0002, "num_tokens": 1110097.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 513.0, "completions/mean_terminated_length": 513.0, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.02324294410625346, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.0007430530495184939, "learning_rate": 4.604051565377533e-06, "loss": 0.0, "num_tokens": 1122537.0, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 455.625, "completions/mean_terminated_length": 455.625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.02342741191662055, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.0008763343430473469, "learning_rate": 4.640883977900552e-06, "loss": 0.0, "num_tokens": 1134166.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 397.375, "completions/mean_terminated_length": 397.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.02361187972698764, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.0020844780519837514, "learning_rate": 4.677716390423574e-06, "loss": 0.0001, "num_tokens": 1142009.0, "reward": 0.701923131942749, "reward_std": 0.5367909669876099, "rewards/fixed_code_pass_all_test_reward/mean": 0.20192308723926544, "rewards/fixed_code_pass_all_test_reward/std": 0.1158415898680687, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 334.875, "completions/mean_terminated_length": 334.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.023796347537354733, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.0020350948834675364, "learning_rate": 4.714548802946593e-06, "loss": 0.0001, "num_tokens": 1150192.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 293.625, "completions/mean_terminated_length": 293.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.023980815347721823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0162353515625, "kl": 0.0010835259381565265, "learning_rate": 4.751381215469614e-06, "loss": 0.0, "num_tokens": 1159229.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.024165283158088913, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.001431961492926348, "learning_rate": 4.788213627992634e-06, "loss": 0.0001, "num_tokens": 1167006.0, "reward": 1.7414772510528564, "reward_std": 0.309943825006485, "rewards/fixed_code_pass_all_test_reward/mean": 0.7414772510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.309943825006485, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 300.75, "completions/mean_terminated_length": 300.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.024349750968456003, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.0014671109602204524, "learning_rate": 4.825046040515654e-06, "loss": 0.0001, "num_tokens": 1172348.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 353.625, "completions/mean_terminated_length": 353.625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.024534218778823096, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.0013779294131381903, "learning_rate": 4.861878453038674e-06, "loss": 0.0001, "num_tokens": 1181473.0, "reward": 0.6760203838348389, "reward_std": 0.8872564435005188, "rewards/fixed_code_pass_all_test_reward/mean": 0.30102041363716125, "rewards/fixed_code_pass_all_test_reward/std": 0.45382362604141235, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 453.5, "completions/mean_terminated_length": 453.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.024718686589190186, "frac_reward_zero_std": 0.0, "grad_norm": 3.609375, "kl": 0.001224210391228553, "learning_rate": 4.898710865561695e-06, "loss": 0.0, "num_tokens": 1194485.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 470.125, "completions/mean_terminated_length": 470.125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.024903154399557276, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.0012207802610646468, "learning_rate": 4.935543278084715e-06, "loss": 0.0, "num_tokens": 1208638.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 551.5, "completions/mean_terminated_length": 551.5, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.025087622209924366, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.0016762256200308912, "learning_rate": 4.972375690607736e-06, "loss": 0.0001, "num_tokens": 1220074.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 410.625, "completions/mean_terminated_length": 410.625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.02527209002029146, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.0008502814634994138, "learning_rate": 5.009208103130756e-06, "loss": 0.0, "num_tokens": 1228895.0, "reward": 0.90625, "reward_std": 0.376485139131546, "rewards/fixed_code_pass_all_test_reward/mean": 0.03125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.02545655783065855, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.0012290667436900549, "learning_rate": 5.046040515653776e-06, "loss": 0.0, "num_tokens": 1238832.0, "reward": 1.0129311084747314, "reward_std": 0.7234779000282288, "rewards/fixed_code_pass_all_test_reward/mean": 0.26293104887008667, "rewards/fixed_code_pass_all_test_reward/std": 0.39860787987709045, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 365.625, "completions/mean_terminated_length": 365.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.02564102564102564, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.0017724489807733335, "learning_rate": 5.082872928176796e-06, "loss": 0.0001, "num_tokens": 1245749.0, "reward": 1.3333332538604736, "reward_std": 0.8809165954589844, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.4779069423675537, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 510.125, "completions/mean_terminated_length": 510.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.025825493451392734, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0011779767446569167, "learning_rate": 5.119705340699817e-06, "loss": 0.0, "num_tokens": 1256654.0, "reward": 1.16847825050354, "reward_std": 0.39945781230926514, "rewards/fixed_code_pass_all_test_reward/mean": 0.29347825050354004, "rewards/fixed_code_pass_all_test_reward/std": 0.1589018553495407, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.026009961261759824, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.0029732241164310835, "learning_rate": 5.156537753222836e-06, "loss": 0.0001, "num_tokens": 1262118.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 325.25, "completions/mean_terminated_length": 325.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.026194429072126914, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0012824747900594957, "learning_rate": 5.193370165745857e-06, "loss": 0.0001, "num_tokens": 1269680.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 548.75, "completions/mean_terminated_length": 334.5714416503906, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.026378896882494004, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.0011051225556002464, "learning_rate": 5.230202578268877e-06, "loss": 0.0, "num_tokens": 1279718.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.026563364692861097, "frac_reward_zero_std": 0.0, "grad_norm": 3.640625, "kl": 0.00512385499314405, "learning_rate": 5.267034990791897e-06, "loss": 0.0002, "num_tokens": 1284486.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.026747832503228187, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.004927866451907903, "learning_rate": 5.303867403314917e-06, "loss": 0.0002, "num_tokens": 1288458.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 436.75, "completions/mean_terminated_length": 436.75, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.026932300313595278, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.001333870146481786, "learning_rate": 5.340699815837938e-06, "loss": 0.0001, "num_tokens": 1300488.0, "reward": 0.9403408765792847, "reward_std": 0.3979243040084839, "rewards/fixed_code_pass_all_test_reward/mean": 0.06534090638160706, "rewards/fixed_code_pass_all_test_reward/std": 0.12114023417234421, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.027116768123962368, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.0024326640996150672, "learning_rate": 5.377532228360957e-06, "loss": 0.0001, "num_tokens": 1308651.0, "reward": 1.245833396911621, "reward_std": 0.6276530623435974, "rewards/fixed_code_pass_all_test_reward/mean": 0.3708333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.4037276804447174, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.02730123593432946, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.005778891441877931, "learning_rate": 5.414364640883978e-06, "loss": 0.0002, "num_tokens": 1313384.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.02748570374469655, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0045926355523988605, "learning_rate": 5.451197053406999e-06, "loss": 0.0002, "num_tokens": 1318189.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 374.5, "completions/mean_terminated_length": 374.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.02767017155506364, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.0029121311381459236, "learning_rate": 5.48802946593002e-06, "loss": 0.0001, "num_tokens": 1326033.0, "reward": 0.90625, "reward_std": 0.376485139131546, "rewards/fixed_code_pass_all_test_reward/mean": 0.03125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.02785463936543073, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.004956265045620967, "learning_rate": 5.524861878453039e-06, "loss": 0.0002, "num_tokens": 1336096.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.028039107175797825, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.0016653771017445251, "learning_rate": 5.56169429097606e-06, "loss": 0.0001, "num_tokens": 1347103.0, "reward": 0.7272727489471436, "reward_std": 0.4319036304950714, "rewards/fixed_code_pass_all_test_reward/mean": 0.10227273404598236, "rewards/fixed_code_pass_all_test_reward/std": 0.25453388690948486, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.028223574986164915, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.00633431707683485, "learning_rate": 5.59852670349908e-06, "loss": 0.0003, "num_tokens": 1351608.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 403.375, "completions/mean_terminated_length": 403.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.028408042796532005, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.0014663313777418807, "learning_rate": 5.6353591160221e-06, "loss": 0.0001, "num_tokens": 1360763.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 618.0, "completions/mean_terminated_length": 618.0, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.028592510606899095, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.0016031799677875824, "learning_rate": 5.67219152854512e-06, "loss": 0.0001, "num_tokens": 1372747.0, "reward": 1.2352941036224365, "reward_std": 0.4941576421260834, "rewards/fixed_code_pass_all_test_reward/mean": 0.4852941036224365, "rewards/fixed_code_pass_all_test_reward/std": 0.234768345952034, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 365.125, "completions/mean_terminated_length": 365.125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.02877697841726619, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.0017519237371743657, "learning_rate": 5.709023941068141e-06, "loss": 0.0001, "num_tokens": 1383412.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 455.75, "completions/mean_terminated_length": 455.75, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.02896144622763328, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.002970991758047603, "learning_rate": 5.74585635359116e-06, "loss": 0.0001, "num_tokens": 1392826.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 348.0, "completions/mean_terminated_length": 348.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.02914591403800037, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.002097335767757613, "learning_rate": 5.782688766114181e-06, "loss": 0.0001, "num_tokens": 1402306.0, "reward": 1.1451148986816406, "reward_std": 0.5559138059616089, "rewards/fixed_code_pass_all_test_reward/mean": 0.2701149582862854, "rewards/fixed_code_pass_all_test_reward/std": 0.3416382670402527, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.02933038184836746, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0016654674982419237, "learning_rate": 5.819521178637201e-06, "loss": 0.0001, "num_tokens": 1410331.0, "reward": 1.3494898080825806, "reward_std": 0.4327077865600586, "rewards/fixed_code_pass_all_test_reward/mean": 0.4744897782802582, "rewards/fixed_code_pass_all_test_reward/std": 0.3277582824230194, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.029514849658734552, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.010404160420875996, "learning_rate": 5.856353591160221e-06, "loss": 0.0004, "num_tokens": 1419285.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 403.0, "completions/mean_terminated_length": 403.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.029699317469101642, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.005802759988000616, "learning_rate": 5.893186003683242e-06, "loss": 0.0002, "num_tokens": 1428381.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.029883785279468732, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.00415519812668208, "learning_rate": 5.9300184162062626e-06, "loss": 0.0002, "num_tokens": 1434092.0, "reward": 1.0909091234207153, "reward_std": 0.4635472297668457, "rewards/fixed_code_pass_all_test_reward/mean": 0.21590909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.16789200901985168, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 108.625, "completions/mean_terminated_length": 108.625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.030068253089835822, "frac_reward_zero_std": 1.0, "grad_norm": 0.041748046875, "kl": 0.0034858378785429522, "learning_rate": 5.966850828729282e-06, "loss": 0.0001, "num_tokens": 1439441.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 468.375, "completions/mean_terminated_length": 468.375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.030252720900202916, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.0033775838383007795, "learning_rate": 6.003683241252303e-06, "loss": 0.0001, "num_tokens": 1447556.0, "reward": 1.125, "reward_std": 0.547480046749115, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.32141217589378357, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.030437188710570006, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.0035687207855517045, "learning_rate": 6.040515653775323e-06, "loss": 0.0001, "num_tokens": 1456738.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 372.375, "completions/mean_terminated_length": 372.375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.030621656520937096, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.0031436838617082685, "learning_rate": 6.077348066298343e-06, "loss": 0.0001, "num_tokens": 1465261.0, "reward": 1.40625, "reward_std": 0.11301689594984055, "rewards/fixed_code_pass_all_test_reward/mean": 0.40625, "rewards/fixed_code_pass_all_test_reward/std": 0.11301688104867935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 283.125, "completions/mean_terminated_length": 283.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.030806124331304186, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.0028306897438596934, "learning_rate": 6.114180478821363e-06, "loss": 0.0001, "num_tokens": 1473246.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 147.25, "completions/mean_terminated_length": 147.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.03099059214167128, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.007660612260224298, "learning_rate": 6.151012891344384e-06, "loss": 0.0003, "num_tokens": 1477152.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.03117505995203837, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.003915921231964603, "learning_rate": 6.187845303867403e-06, "loss": 0.0002, "num_tokens": 1486496.0, "reward": 1.048828125, "reward_std": 0.5084002614021301, "rewards/fixed_code_pass_all_test_reward/mean": 0.173828125, "rewards/fixed_code_pass_all_test_reward/std": 0.34571653604507446, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 249.375, "completions/mean_terminated_length": 249.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.03135952776240546, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.006079388782382011, "learning_rate": 6.224677716390424e-06, "loss": 0.0002, "num_tokens": 1492235.0, "reward": 1.079545497894287, "reward_std": 0.11331124603748322, "rewards/fixed_code_pass_all_test_reward/mean": 0.07954545319080353, "rewards/fixed_code_pass_all_test_reward/std": 0.11331122368574142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 344.25, "completions/mean_terminated_length": 344.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.03154399557277255, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.0038337640871759504, "learning_rate": 6.261510128913444e-06, "loss": 0.0002, "num_tokens": 1501533.0, "reward": 1.2291667461395264, "reward_std": 0.5700007081031799, "rewards/fixed_code_pass_all_test_reward/mean": 0.3541666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.38253021240234375, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 419.0, "completions/mean_terminated_length": 419.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.03172846338313964, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.005736910767154768, "learning_rate": 6.298342541436464e-06, "loss": 0.0002, "num_tokens": 1507805.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 319.0, "completions/mean_terminated_length": 319.0, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.03191293119350673, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.004880537773715332, "learning_rate": 6.335174953959485e-06, "loss": 0.0002, "num_tokens": 1514197.0, "reward": 1.649999976158142, "reward_std": 0.5529143214225769, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.18516401946544647, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 537.75, "completions/mean_terminated_length": 537.75, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.032097399003873824, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.002050281884294236, "learning_rate": 6.3720073664825055e-06, "loss": 0.0001, "num_tokens": 1530635.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.03228186681424092, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0030574371048714966, "learning_rate": 6.408839779005526e-06, "loss": 0.0001, "num_tokens": 1540456.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 374.125, "completions/mean_terminated_length": 374.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.032466334624608004, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.00890902552055195, "learning_rate": 6.445672191528546e-06, "loss": 0.0004, "num_tokens": 1548225.0, "reward": 1.0, "reward_std": 0.6546536684036255, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 291.5, "completions/mean_terminated_length": 291.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.0326508024349751, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.005207811002037488, "learning_rate": 6.482504604051566e-06, "loss": 0.0002, "num_tokens": 1555117.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 242.25, "completions/mean_terminated_length": 242.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.03283527024534219, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.00411636324133724, "learning_rate": 6.5193370165745865e-06, "loss": 0.0002, "num_tokens": 1563039.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 370.0, "completions/mean_terminated_length": 370.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.03301973805570928, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.0039145527116488665, "learning_rate": 6.556169429097606e-06, "loss": 0.0002, "num_tokens": 1573927.0, "reward": 0.8611111640930176, "reward_std": 0.5657789707183838, "rewards/fixed_code_pass_all_test_reward/mean": 0.1111111119389534, "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 279.5, "completions/mean_terminated_length": 279.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.03320420586607637, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.003935759901651181, "learning_rate": 6.5930018416206266e-06, "loss": 0.0002, "num_tokens": 1578883.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 340.375, "completions/mean_terminated_length": 340.375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.03338867367644346, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.008080898900516331, "learning_rate": 6.629834254143647e-06, "loss": 0.0003, "num_tokens": 1586430.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 285.75, "completions/mean_terminated_length": 285.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.03357314148681055, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.005378383022616617, "learning_rate": 6.666666666666667e-06, "loss": 0.0002, "num_tokens": 1595724.0, "reward": 0.954268217086792, "reward_std": 0.3403259813785553, "rewards/fixed_code_pass_all_test_reward/mean": 0.07926829159259796, "rewards/fixed_code_pass_all_test_reward/std": 0.054926421493291855, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 477.375, "completions/mean_terminated_length": 477.375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.033757609297177645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0184326171875, "kl": 0.002300884443684481, "learning_rate": 6.703499079189687e-06, "loss": 0.0001, "num_tokens": 1607399.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 439.875, "completions/mean_terminated_length": 210.1428680419922, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.03394207710754473, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.004054837863805005, "learning_rate": 6.740331491712708e-06, "loss": 0.0002, "num_tokens": 1613766.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 327.875, "completions/mean_terminated_length": 327.875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.034126544917911825, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.0030110061779851094, "learning_rate": 6.777163904235729e-06, "loss": 0.0001, "num_tokens": 1620349.0, "reward": 1.390625, "reward_std": 0.3746277689933777, "rewards/fixed_code_pass_all_test_reward/mean": 0.390625, "rewards/fixed_code_pass_all_test_reward/std": 0.3746277987957001, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 415.0, "completions/mean_terminated_length": 415.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.03431101272827892, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.0035689854703377932, "learning_rate": 6.8139963167587485e-06, "loss": 0.0001, "num_tokens": 1628349.0, "reward": 1.058823585510254, "reward_std": 0.10892000794410706, "rewards/fixed_code_pass_all_test_reward/mean": 0.05882352963089943, "rewards/fixed_code_pass_all_test_reward/std": 0.10892001539468765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 311.75, "completions/mean_terminated_length": 311.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.034495480538646005, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.0025288845936302096, "learning_rate": 6.850828729281769e-06, "loss": 0.0001, "num_tokens": 1635715.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 224.75, "completions/mean_terminated_length": 224.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.0346799483490131, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.004095649841474369, "learning_rate": 6.8876611418047886e-06, "loss": 0.0002, "num_tokens": 1643521.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 348.875, "completions/mean_terminated_length": 348.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.034864416159380185, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.006737041345331818, "learning_rate": 6.924493554327809e-06, "loss": 0.0003, "num_tokens": 1653032.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 566.5, "completions/mean_terminated_length": 354.8571472167969, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.03504888396974728, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.0034460922179277986, "learning_rate": 6.9613259668508295e-06, "loss": 0.0001, "num_tokens": 1665844.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 327.125, "completions/mean_terminated_length": 327.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.03523335178011437, "frac_reward_zero_std": 1.0, "grad_norm": 0.032470703125, "kl": 0.0033368252916261554, "learning_rate": 6.998158379373849e-06, "loss": 0.0001, "num_tokens": 1671653.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 400.875, "completions/mean_terminated_length": 400.875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.03541781959048146, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.013381881464738399, "learning_rate": 7.0349907918968695e-06, "loss": 0.0005, "num_tokens": 1682172.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.03560228740084855, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.003746343369130045, "learning_rate": 7.07182320441989e-06, "loss": 0.0001, "num_tokens": 1690206.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.035786755211215646, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.009934608824551105, "learning_rate": 7.10865561694291e-06, "loss": 0.0004, "num_tokens": 1694371.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 294.0, "completions/mean_terminated_length": 294.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.03597122302158273, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.014858316193567589, "learning_rate": 7.14548802946593e-06, "loss": 0.0006, "num_tokens": 1701163.0, "reward": 0.8870967626571655, "reward_std": 0.5752436518669128, "rewards/fixed_code_pass_all_test_reward/mean": 0.13709676265716553, "rewards/fixed_code_pass_all_test_reward/std": 0.19564911723136902, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 356.875, "completions/mean_terminated_length": 356.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.036155690831949826, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.006767428742023185, "learning_rate": 7.182320441988951e-06, "loss": 0.0003, "num_tokens": 1708922.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 258.75, "completions/mean_terminated_length": 258.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.03634015864231691, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.007978322595590726, "learning_rate": 7.219152854511972e-06, "loss": 0.0003, "num_tokens": 1714056.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.036524626452684006, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.006453410227550194, "learning_rate": 7.2559852670349914e-06, "loss": 0.0003, "num_tokens": 1721425.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 301.75, "completions/mean_terminated_length": 301.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.0367090942630511, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.022486777626909316, "learning_rate": 7.292817679558012e-06, "loss": 0.0009, "num_tokens": 1728255.0, "reward": 1.345588207244873, "reward_std": 0.5860779881477356, "rewards/fixed_code_pass_all_test_reward/mean": 0.4705882668495178, "rewards/fixed_code_pass_all_test_reward/std": 0.3429971933364868, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 393.875, "completions/mean_terminated_length": 393.875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.036893562073418186, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.004279178538126871, "learning_rate": 7.329650092081032e-06, "loss": 0.0002, "num_tokens": 1735630.0, "reward": 1.5357142686843872, "reward_std": 0.5050762891769409, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.4040610194206238, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 550.5, "completions/mean_terminated_length": 550.5, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.03707802988378528, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.003288590276497416, "learning_rate": 7.366482504604052e-06, "loss": 0.0001, "num_tokens": 1749698.0, "reward": 0.7857142686843872, "reward_std": 0.7130091190338135, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.40179988741874695, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.03726249769415237, "frac_reward_zero_std": 0.0, "grad_norm": 4.09375, "kl": 0.010952277603792027, "learning_rate": 7.4033149171270724e-06, "loss": 0.0004, "num_tokens": 1758056.0, "reward": 1.3875000476837158, "reward_std": 0.6194756031036377, "rewards/fixed_code_pass_all_test_reward/mean": 0.512499988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.33514389395713806, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 178.125, "completions/mean_terminated_length": 178.125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.03744696550451946, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.007107935496605933, "learning_rate": 7.440147329650093e-06, "loss": 0.0003, "num_tokens": 1762441.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 315.875, "completions/mean_terminated_length": 315.875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.03763143331488655, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.004974819399649277, "learning_rate": 7.4769797421731125e-06, "loss": 0.0002, "num_tokens": 1772240.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 391.25, "completions/mean_terminated_length": 391.25, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.03781590112525364, "frac_reward_zero_std": 1.0, "grad_norm": 0.03759765625, "kl": 0.004130982037167996, "learning_rate": 7.513812154696133e-06, "loss": 0.0002, "num_tokens": 1780490.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.03800036893562073, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.009049577231053263, "learning_rate": 7.5506445672191534e-06, "loss": 0.0004, "num_tokens": 1789467.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 303.75, "completions/mean_terminated_length": 303.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.03818483674598783, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.005089367710752413, "learning_rate": 7.587476979742173e-06, "loss": 0.0002, "num_tokens": 1794705.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 249.75, "completions/mean_terminated_length": 249.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.03836930455635491, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.008064881345489994, "learning_rate": 7.624309392265194e-06, "loss": 0.0003, "num_tokens": 1802119.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 279.375, "completions/mean_terminated_length": 279.375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.03855377236672201, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.00822117441566661, "learning_rate": 7.661141804788215e-06, "loss": 0.0003, "num_tokens": 1808250.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 593.125, "completions/mean_terminated_length": 593.125, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.0387382401770891, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.004660998485633172, "learning_rate": 7.697974217311234e-06, "loss": 0.0002, "num_tokens": 1819411.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.03892270798745619, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.01024687581229955, "learning_rate": 7.734806629834256e-06, "loss": 0.0004, "num_tokens": 1824517.0, "reward": 0.875, "reward_std": 0.5230405926704407, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320515871048, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 247.0, "completions/mean_terminated_length": 247.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.03910717579782328, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.006571808335138485, "learning_rate": 7.771639042357275e-06, "loss": 0.0003, "num_tokens": 1830901.0, "reward": 1.278846263885498, "reward_std": 0.301972895860672, "rewards/fixed_code_pass_all_test_reward/mean": 0.2788461744785309, "rewards/fixed_code_pass_all_test_reward/std": 0.3019729554653168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.039291643608190374, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.015500773704843596, "learning_rate": 7.808471454880295e-06, "loss": 0.0006, "num_tokens": 1837314.0, "reward": 0.8857758045196533, "reward_std": 0.42821115255355835, "rewards/fixed_code_pass_all_test_reward/mean": 0.1357758641242981, "rewards/fixed_code_pass_all_test_reward/std": 0.09858758002519608, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 570.25, "completions/mean_terminated_length": 359.14288330078125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.03947611141855746, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.005336191155947745, "learning_rate": 7.845303867403316e-06, "loss": 0.0002, "num_tokens": 1846084.0, "reward": 1.524999976158142, "reward_std": 0.7086203694343567, "rewards/fixed_code_pass_all_test_reward/mean": 0.6499999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.43752554059028625, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 660.5, "completions/mean_terminated_length": 660.5, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.039660579228924554, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.0027420165861258283, "learning_rate": 7.882136279926336e-06, "loss": 0.0001, "num_tokens": 1861856.0, "reward": 1.0774999856948853, "reward_std": 0.21920308470726013, "rewards/fixed_code_pass_all_test_reward/mean": 0.07750000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.21920311450958252, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.03984504703929164, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.007695890933973715, "learning_rate": 7.918968692449355e-06, "loss": 0.0003, "num_tokens": 1867468.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.040029514849658734, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.011165565505507402, "learning_rate": 7.955801104972377e-06, "loss": 0.0004, "num_tokens": 1872096.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 280.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.04021398266002583, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.009477649931795895, "learning_rate": 7.992633517495396e-06, "loss": 0.0004, "num_tokens": 1882233.0, "reward": 1.0208333730697632, "reward_std": 0.534867525100708, "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333283662796, "rewards/fixed_code_pass_all_test_reward/std": 0.34556713700294495, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 365.875, "completions/mean_terminated_length": 365.875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.040398450470392915, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.004230408289004117, "learning_rate": 8.029465930018416e-06, "loss": 0.0002, "num_tokens": 1891856.0, "reward": 1.3461538553237915, "reward_std": 0.65141761302948, "rewards/fixed_code_pass_all_test_reward/mean": 0.4711538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.40586885809898376, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 525.5, "completions/mean_terminated_length": 525.5, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.04058291828076001, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.004400778474519029, "learning_rate": 8.066298342541437e-06, "loss": 0.0002, "num_tokens": 1902428.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 288.25, "completions/mean_terminated_length": 288.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.0407673860911271, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.00576471199747175, "learning_rate": 8.103130755064459e-06, "loss": 0.0002, "num_tokens": 1910878.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.04095185390149419, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.012682576023507863, "learning_rate": 8.139963167587478e-06, "loss": 0.0005, "num_tokens": 1914980.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 210.875, "completions/mean_terminated_length": 210.875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.04113632171186128, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.012095119658624753, "learning_rate": 8.176795580110498e-06, "loss": 0.0005, "num_tokens": 1919691.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.04132078952222837, "frac_reward_zero_std": 0.0, "grad_norm": 3.765625, "kl": 0.01657194524887018, "learning_rate": 8.213627992633517e-06, "loss": 0.0007, "num_tokens": 1923920.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.04150525733259546, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.00855798478005454, "learning_rate": 8.250460405156539e-06, "loss": 0.0003, "num_tokens": 1929190.0, "reward": 1.15625, "reward_std": 0.7270804047584534, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.34771791100502014, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.041689725142962555, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.008159032498952001, "learning_rate": 8.287292817679558e-06, "loss": 0.0003, "num_tokens": 1935791.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 574.75, "completions/mean_terminated_length": 364.2857360839844, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.04187419295332964, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.007861191697884351, "learning_rate": 8.324125230202578e-06, "loss": 0.0003, "num_tokens": 1946877.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.042058660763696736, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.015508275711908937, "learning_rate": 8.3609576427256e-06, "loss": 0.0006, "num_tokens": 1950903.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 386.375, "completions/mean_terminated_length": 386.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.04224312857406383, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.013374619622481987, "learning_rate": 8.397790055248619e-06, "loss": 0.0005, "num_tokens": 1963722.0, "reward": 1.7820247411727905, "reward_std": 0.3374994695186615, "rewards/fixed_code_pass_all_test_reward/mean": 0.7820247411727905, "rewards/fixed_code_pass_all_test_reward/std": 0.3374994993209839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.042427596384430916, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.007341455901041627, "learning_rate": 8.434622467771639e-06, "loss": 0.0003, "num_tokens": 1972210.0, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 345.0, "completions/mean_terminated_length": 345.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.04261206419479801, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.004687510838266462, "learning_rate": 8.47145488029466e-06, "loss": 0.0002, "num_tokens": 1979354.0, "reward": 0.9261363744735718, "reward_std": 0.3810269236564636, "rewards/fixed_code_pass_all_test_reward/mean": 0.05113636702299118, "rewards/fixed_code_pass_all_test_reward/std": 0.07464002817869186, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 471.25, "completions/mean_terminated_length": 471.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.042796532005165096, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.004036704529426061, "learning_rate": 8.508287292817681e-06, "loss": 0.0002, "num_tokens": 1993724.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 187.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.04298099981553219, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "kl": 0.008058118721237406, "learning_rate": 8.5451197053407e-06, "loss": 0.0003, "num_tokens": 2002538.0, "reward": 1.7693965435028076, "reward_std": 0.31088316440582275, "rewards/fixed_code_pass_all_test_reward/mean": 0.7693965435028076, "rewards/fixed_code_pass_all_test_reward/std": 0.31088316440582275, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 331.0, "completions/mean_terminated_length": 331.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.04316546762589928, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.011450041580246761, "learning_rate": 8.58195211786372e-06, "loss": 0.0005, "num_tokens": 2013370.0, "reward": 0.8789682388305664, "reward_std": 0.7450873255729675, "rewards/fixed_code_pass_all_test_reward/mean": 0.2539682686328888, "rewards/fixed_code_pass_all_test_reward/std": 0.3521827459335327, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.04334993543626637, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.010984100255882367, "learning_rate": 8.618784530386742e-06, "loss": 0.0004, "num_tokens": 2018459.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 251.875, "completions/mean_terminated_length": 251.875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.04353440324663346, "frac_reward_zero_std": 0.0, "grad_norm": 4.03125, "kl": 0.009617811942007393, "learning_rate": 8.655616942909761e-06, "loss": 0.0004, "num_tokens": 2024778.0, "reward": 1.568750023841858, "reward_std": 0.27894893288612366, "rewards/fixed_code_pass_all_test_reward/mean": 0.5687500238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.27894893288612366, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.043718871057000556, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.011850156530272216, "learning_rate": 8.692449355432781e-06, "loss": 0.0005, "num_tokens": 2032971.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 103.25, "completions/mean_terminated_length": 103.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.04390333886736764, "frac_reward_zero_std": 0.0, "grad_norm": 3.375, "kl": 0.01781626994488761, "learning_rate": 8.729281767955802e-06, "loss": 0.0007, "num_tokens": 2036453.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.04408780667773474, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.007998431363375857, "learning_rate": 8.766114180478822e-06, "loss": 0.0003, "num_tokens": 2044151.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 282.75, "completions/mean_terminated_length": 282.75, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.04427227448810182, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.011229310533963144, "learning_rate": 8.802946593001841e-06, "loss": 0.0004, "num_tokens": 2051021.0, "reward": 1.4886362552642822, "reward_std": 0.35934969782829285, "rewards/fixed_code_pass_all_test_reward/mean": 0.4886363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.3593497574329376, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.04445674229846892, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.009351369750220329, "learning_rate": 8.839779005524863e-06, "loss": 0.0004, "num_tokens": 2057772.0, "reward": 0.9120370149612427, "reward_std": 0.36874645948410034, "rewards/fixed_code_pass_all_test_reward/mean": 0.03703703731298447, "rewards/fixed_code_pass_all_test_reward/std": 0.019797129556536674, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 238.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.04464121010883601, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.014665447932202369, "learning_rate": 8.876611418047882e-06, "loss": 0.0006, "num_tokens": 2063198.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 414.75, "completions/mean_terminated_length": 414.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.0448256779192031, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.004940654267556965, "learning_rate": 8.913443830570904e-06, "loss": 0.0002, "num_tokens": 2074084.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.04501014572957019, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.007953379856189713, "learning_rate": 8.950276243093923e-06, "loss": 0.0003, "num_tokens": 2082220.0, "reward": 1.4934895038604736, "reward_std": 0.7302062511444092, "rewards/fixed_code_pass_all_test_reward/mean": 0.6184895634651184, "rewards/fixed_code_pass_all_test_reward/std": 0.48113393783569336, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 136.375, "completions/mean_terminated_length": 136.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.045194613539937284, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.016704957117326558, "learning_rate": 8.987108655616945e-06, "loss": 0.0007, "num_tokens": 2089847.0, "reward": 1.6130952835083008, "reward_std": 0.46825870871543884, "rewards/fixed_code_pass_all_test_reward/mean": 0.613095223903656, "rewards/fixed_code_pass_all_test_reward/std": 0.468258798122406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.04537908135030437, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.011117411311715841, "learning_rate": 9.023941068139964e-06, "loss": 0.0004, "num_tokens": 2099113.0, "reward": 1.1607142686843872, "reward_std": 0.09155284613370895, "rewards/fixed_code_pass_all_test_reward/mean": 0.1607142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.09155285358428955, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 263.875, "completions/mean_terminated_length": 263.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.045563549160671464, "frac_reward_zero_std": 0.0, "grad_norm": 3.8125, "kl": 0.014848415623418987, "learning_rate": 9.060773480662984e-06, "loss": 0.0006, "num_tokens": 2107920.0, "reward": 1.53125, "reward_std": 0.6605936288833618, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.35197150707244873, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 288.75, "completions/mean_terminated_length": 288.75, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.04574801697103855, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.008733909227885306, "learning_rate": 9.097605893186005e-06, "loss": 0.0003, "num_tokens": 2113974.0, "reward": 1.2916666269302368, "reward_std": 0.5326632261276245, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.36731547117233276, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 181.5, "completions/mean_terminated_length": 181.5, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.045932484781405644, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.0165499874856323, "learning_rate": 9.134438305709025e-06, "loss": 0.0007, "num_tokens": 2119338.0, "reward": 1.24609375, "reward_std": 0.6964589953422546, "rewards/fixed_code_pass_all_test_reward/mean": 0.49609375, "rewards/fixed_code_pass_all_test_reward/std": 0.34816062450408936, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 537.25, "completions/mean_terminated_length": 537.25, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.04611695259177274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0296630859375, "kl": 0.0035085025301668793, "learning_rate": 9.171270718232044e-06, "loss": 0.0001, "num_tokens": 2130220.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 398.5, "completions/mean_terminated_length": 398.5, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.046301420402139824, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.006344130146317184, "learning_rate": 9.208103130755066e-06, "loss": 0.0003, "num_tokens": 2137504.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.04648588821250692, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.015479651628993452, "learning_rate": 9.244935543278085e-06, "loss": 0.0006, "num_tokens": 2141569.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 114.25, "completions/mean_terminated_length": 114.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.04667035602287401, "frac_reward_zero_std": 0.0, "grad_norm": 4.375, "kl": 0.019862066605128348, "learning_rate": 9.281767955801105e-06, "loss": 0.0008, "num_tokens": 2145211.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 350.625, "completions/mean_terminated_length": 350.625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.0468548238332411, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.004922383319353685, "learning_rate": 9.318600368324126e-06, "loss": 0.0002, "num_tokens": 2153360.0, "reward": 0.8803571462631226, "reward_std": 0.35587018728256226, "rewards/fixed_code_pass_all_test_reward/mean": 0.0053571430034935474, "rewards/fixed_code_pass_all_test_reward/std": 0.010628911666572094, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.04703929164360819, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.010047366027720273, "learning_rate": 9.355432780847147e-06, "loss": 0.0004, "num_tokens": 2157913.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.04722375945397528, "frac_reward_zero_std": 1.0, "grad_norm": 0.044921875, "kl": 0.007750129705527797, "learning_rate": 9.392265193370167e-06, "loss": 0.0003, "num_tokens": 2164678.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 289.0, "completions/mean_terminated_length": 289.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.04740822726434237, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.012835880101192743, "learning_rate": 9.429097605893187e-06, "loss": 0.0005, "num_tokens": 2172558.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 169.875, "completions/mean_terminated_length": 169.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.047592695074709465, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.017090481298509985, "learning_rate": 9.465930018416208e-06, "loss": 0.0007, "num_tokens": 2179853.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.04777716288507655, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.01581138389883563, "learning_rate": 9.502762430939228e-06, "loss": 0.0006, "num_tokens": 2183959.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.047961630695443645, "frac_reward_zero_std": 0.0, "grad_norm": 4.71875, "kl": 0.0409209078643471, "learning_rate": 9.539594843462247e-06, "loss": 0.0016, "num_tokens": 2187773.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 503.0, "completions/mean_terminated_length": 503.0, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.04814609850581074, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.004694517090683803, "learning_rate": 9.576427255985269e-06, "loss": 0.0002, "num_tokens": 2197349.0, "reward": 1.8214285373687744, "reward_std": 0.16642355918884277, "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.16642354428768158, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.048330566316177825, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.02295152225997299, "learning_rate": 9.613259668508288e-06, "loss": 0.0009, "num_tokens": 2206817.0, "reward": 1.483630895614624, "reward_std": 0.460407555103302, "rewards/fixed_code_pass_all_test_reward/mean": 0.6086309552192688, "rewards/fixed_code_pass_all_test_reward/std": 0.4142269492149353, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 135.25, "completions/mean_terminated_length": 135.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.04851503412654492, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.03507366660051048, "learning_rate": 9.650092081031308e-06, "loss": 0.0014, "num_tokens": 2213691.0, "reward": 1.774999976158142, "reward_std": 0.24053514003753662, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.24053511023521423, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.048699501936912006, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.016384446062147617, "learning_rate": 9.686924493554329e-06, "loss": 0.0007, "num_tokens": 2221771.0, "reward": 1.301829218864441, "reward_std": 0.12015223503112793, "rewards/fixed_code_pass_all_test_reward/mean": 0.3018292784690857, "rewards/fixed_code_pass_all_test_reward/std": 0.12015224248170853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 213.375, "completions/mean_terminated_length": 213.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.0488839697472791, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.00623988505685702, "learning_rate": 9.723756906077349e-06, "loss": 0.0002, "num_tokens": 2226742.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 389.25, "completions/mean_terminated_length": 389.25, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.04906843755764619, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.00676361596561037, "learning_rate": 9.760589318600368e-06, "loss": 0.0003, "num_tokens": 2234848.0, "reward": 1.4375, "reward_std": 0.20280565321445465, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.20280568301677704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 437.375, "completions/mean_terminated_length": 437.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.04925290536801328, "frac_reward_zero_std": 1.0, "grad_norm": 0.02392578125, "kl": 0.0036239378387108445, "learning_rate": 9.79742173112339e-06, "loss": 0.0001, "num_tokens": 2247075.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 198.875, "completions/mean_terminated_length": 198.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.04943737317838037, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.017043095140252262, "learning_rate": 9.834254143646411e-06, "loss": 0.0007, "num_tokens": 2256586.0, "reward": 1.100961446762085, "reward_std": 0.04079460725188255, "rewards/fixed_code_pass_all_test_reward/mean": 0.10096153616905212, "rewards/fixed_code_pass_all_test_reward/std": 0.04079462215304375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 208.0, "completions/mean_terminated_length": 208.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.049621840988747466, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.010467794520081952, "learning_rate": 9.87108655616943e-06, "loss": 0.0004, "num_tokens": 2261658.0, "reward": 1.8928570747375488, "reward_std": 0.30304577946662903, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.30304577946662903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.04980630879911455, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005859375, "kl": 0.008148560184054077, "learning_rate": 9.90791896869245e-06, "loss": 0.0003, "num_tokens": 2266840.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.049990776609481646, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.015943627164233476, "learning_rate": 9.944751381215471e-06, "loss": 0.0006, "num_tokens": 2273989.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 324.625, "completions/mean_terminated_length": 324.625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.05017524441984873, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.01634393830318004, "learning_rate": 9.981583793738491e-06, "loss": 0.0007, "num_tokens": 2283274.0, "reward": 1.0625, "reward_std": 0.6781013607978821, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 395.75, "completions/mean_terminated_length": 395.75, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.050359712230215826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0576171875, "kl": 0.009377073729410768, "learning_rate": 1.0018416206261512e-05, "loss": 0.0004, "num_tokens": 2291312.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 215.25, "completions/mean_terminated_length": 215.25, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.05054418004058292, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.021001623012125492, "learning_rate": 1.0055248618784532e-05, "loss": 0.0008, "num_tokens": 2300226.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 637.875, "completions/mean_terminated_length": 637.875, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.05072864785095001, "frac_reward_zero_std": 1.0, "grad_norm": 0.047607421875, "kl": 0.007694698870182037, "learning_rate": 1.0092081031307552e-05, "loss": 0.0003, "num_tokens": 2311953.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 236.0, "completions/mean_terminated_length": 236.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.0509131156613171, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.013247085793409497, "learning_rate": 1.0128913443830573e-05, "loss": 0.0005, "num_tokens": 2318097.0, "reward": 1.148936152458191, "reward_std": 0.4272003769874573, "rewards/fixed_code_pass_all_test_reward/mean": 0.2739361822605133, "rewards/fixed_code_pass_all_test_reward/std": 0.20005740225315094, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 526.125, "completions/mean_terminated_length": 526.125, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.051097583471684194, "frac_reward_zero_std": 1.0, "grad_norm": 0.03955078125, "kl": 0.005961827235296369, "learning_rate": 1.0165745856353592e-05, "loss": 0.0002, "num_tokens": 2329538.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 347.0, "completions/mean_terminated_length": 347.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.05128205128205128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.00815270227030851, "learning_rate": 1.0202578268876612e-05, "loss": 0.0003, "num_tokens": 2338642.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.051466519092418374, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.026112373801879585, "learning_rate": 1.0239410681399633e-05, "loss": 0.001, "num_tokens": 2343000.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.05165098690278547, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.024990892969071865, "learning_rate": 1.0276243093922653e-05, "loss": 0.001, "num_tokens": 2354949.0, "reward": 1.394230842590332, "reward_std": 0.1602381020784378, "rewards/fixed_code_pass_all_test_reward/mean": 0.39423078298568726, "rewards/fixed_code_pass_all_test_reward/std": 0.1602380871772766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.051835454713152554, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.02688002015929669, "learning_rate": 1.0313075506445673e-05, "loss": 0.0011, "num_tokens": 2358910.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 118.5, "completions/mean_terminated_length": 118.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.05201992252351965, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.013506916991900653, "learning_rate": 1.0349907918968694e-05, "loss": 0.0005, "num_tokens": 2362666.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.052204390333886734, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.020774402422830462, "learning_rate": 1.0386740331491714e-05, "loss": 0.0008, "num_tokens": 2372991.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 245.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.05238885814425383, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.007869014516472816, "learning_rate": 1.0423572744014733e-05, "loss": 0.0003, "num_tokens": 2379653.0, "reward": 1.6363636255264282, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 313.125, "completions/mean_terminated_length": 313.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.05257332595462092, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.010908835858572274, "learning_rate": 1.0460405156537754e-05, "loss": 0.0004, "num_tokens": 2391278.0, "reward": 1.3048779964447021, "reward_std": 0.4468904435634613, "rewards/fixed_code_pass_all_test_reward/mean": 0.3048780560493469, "rewards/fixed_code_pass_all_test_reward/std": 0.4468904137611389, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 209.625, "completions/mean_terminated_length": 209.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.05275779376498801, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.013380130869336426, "learning_rate": 1.0497237569060774e-05, "loss": 0.0005, "num_tokens": 2395851.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.0529422615753551, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.01931053481530398, "learning_rate": 1.0534069981583794e-05, "loss": 0.0008, "num_tokens": 2401180.0, "reward": 1.8333332538604736, "reward_std": 0.40824824571609497, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.07715168595314026, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.053126729385722195, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.025172464665956795, "learning_rate": 1.0570902394106815e-05, "loss": 0.001, "num_tokens": 2405281.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 108.875, "completions/mean_terminated_length": 108.875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.05331119719608928, "frac_reward_zero_std": 0.0, "grad_norm": 4.15625, "kl": 0.02612029400188476, "learning_rate": 1.0607734806629835e-05, "loss": 0.001, "num_tokens": 2408936.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 308.25, "completions/mean_terminated_length": 308.25, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.053495665006456375, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.005354907625587657, "learning_rate": 1.0644567219152854e-05, "loss": 0.0002, "num_tokens": 2416322.0, "reward": 1.4940476417541504, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.6190476417541504, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 135.75, "completions/mean_terminated_length": 135.75, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.05368013281682346, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.026078032213263214, "learning_rate": 1.0681399631675875e-05, "loss": 0.001, "num_tokens": 2420256.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 135.75, "completions/mean_terminated_length": 135.75, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.053864600627190555, "frac_reward_zero_std": 0.0, "grad_norm": 3.328125, "kl": 0.03338433289900422, "learning_rate": 1.0718232044198895e-05, "loss": 0.0013, "num_tokens": 2424118.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 230.0, "completions/mean_terminated_length": 230.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.05404906843755765, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.01580487343017012, "learning_rate": 1.0755064456721915e-05, "loss": 0.0006, "num_tokens": 2434230.0, "reward": 1.818750023841858, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.8187500238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.054233536247924735, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.024768620496615767, "learning_rate": 1.0791896869244936e-05, "loss": 0.001, "num_tokens": 2441712.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 102.5, "completions/mean_terminated_length": 102.5, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.05441800405829183, "frac_reward_zero_std": 1.0, "grad_norm": 0.322265625, "kl": 0.04006415093317628, "learning_rate": 1.0828729281767956e-05, "loss": 0.0016, "num_tokens": 2449780.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.05460247186865892, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.02386624866630882, "learning_rate": 1.0865561694290975e-05, "loss": 0.001, "num_tokens": 2455196.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 313.375, "completions/mean_terminated_length": 313.375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.05478693967902601, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.009536665282212198, "learning_rate": 1.0902394106813998e-05, "loss": 0.0004, "num_tokens": 2462991.0, "reward": 0.8571428656578064, "reward_std": 0.9202600717544556, "rewards/fixed_code_pass_all_test_reward/mean": 0.3571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.3911534249782562, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.0549714074893931, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.01910259947180748, "learning_rate": 1.0939226519337018e-05, "loss": 0.0008, "num_tokens": 2468286.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 104.75, "completions/mean_terminated_length": 104.75, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.05515587529976019, "frac_reward_zero_std": 1.0, "grad_norm": 0.171875, "kl": 0.03134882263839245, "learning_rate": 1.097605893186004e-05, "loss": 0.0013, "num_tokens": 2474492.0, "reward": 1.814814805984497, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8148148059844971, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 193.5, "completions/mean_terminated_length": 193.5, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.05534034311012728, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.02588577044662088, "learning_rate": 1.1012891344383059e-05, "loss": 0.001, "num_tokens": 2479160.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 198.5, "completions/mean_terminated_length": 198.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.055524810920494376, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.014305709628388286, "learning_rate": 1.1049723756906078e-05, "loss": 0.0006, "num_tokens": 2488244.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 207.125, "completions/mean_terminated_length": 207.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.05570927873086146, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.013118399772793055, "learning_rate": 1.10865561694291e-05, "loss": 0.0005, "num_tokens": 2496053.0, "reward": 1.0208332538604736, "reward_std": 0.05892553552985191, "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.055893746541228556, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.012167995097115636, "learning_rate": 1.112338858195212e-05, "loss": 0.0005, "num_tokens": 2505030.0, "reward": 1.0904254913330078, "reward_std": 0.17792190611362457, "rewards/fixed_code_pass_all_test_reward/mean": 0.0904255360364914, "rewards/fixed_code_pass_all_test_reward/std": 0.17792193591594696, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 309.25, "completions/mean_terminated_length": 309.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.05607821435159565, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.021112728165462613, "learning_rate": 1.1160220994475139e-05, "loss": 0.0008, "num_tokens": 2510488.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 405.875, "completions/mean_terminated_length": 405.875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.056262682161962736, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.007208537484984845, "learning_rate": 1.119705340699816e-05, "loss": 0.0003, "num_tokens": 2521567.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 436.625, "completions/mean_terminated_length": 436.625, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.05644714997232983, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.005880947690457106, "learning_rate": 1.123388581952118e-05, "loss": 0.0002, "num_tokens": 2532420.0, "reward": 1.1315789222717285, "reward_std": 0.3132731318473816, "rewards/fixed_code_pass_all_test_reward/mean": 0.1315789520740509, "rewards/fixed_code_pass_all_test_reward/std": 0.31327319145202637, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.056631617782696916, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.016911669896217063, "learning_rate": 1.12707182320442e-05, "loss": 0.0007, "num_tokens": 2541281.0, "reward": 1.0, "reward_std": 0.4364357888698578, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.17251639068126678, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 181.625, "completions/mean_terminated_length": 181.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.05681608559306401, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.012866285513155162, "learning_rate": 1.130755064456722e-05, "loss": 0.0005, "num_tokens": 2547022.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 108.0, "completions/mean_terminated_length": 108.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.0570005534034311, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.06013664184138179, "learning_rate": 1.134438305709024e-05, "loss": 0.0024, "num_tokens": 2550654.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 71.125, "completions/mean_terminated_length": 71.125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.05718502121379819, "frac_reward_zero_std": 0.0, "grad_norm": 4.375, "kl": 0.08321865368634462, "learning_rate": 1.138121546961326e-05, "loss": 0.0033, "num_tokens": 2554079.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 558.625, "completions/mean_terminated_length": 345.8571472167969, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.057369489024165284, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.010555637301877141, "learning_rate": 1.1418047882136281e-05, "loss": 0.0004, "num_tokens": 2563956.0, "reward": 1.1490384340286255, "reward_std": 0.6015573740005493, "rewards/fixed_code_pass_all_test_reward/mean": 0.2740384638309479, "rewards/fixed_code_pass_all_test_reward/std": 0.39821431040763855, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 224.75, "completions/mean_terminated_length": 224.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.05755395683453238, "frac_reward_zero_std": 1.0, "grad_norm": 0.060302734375, "kl": 0.009426304837688804, "learning_rate": 1.1454880294659301e-05, "loss": 0.0004, "num_tokens": 2572138.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.057738424644899464, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.019806429860182106, "learning_rate": 1.149171270718232e-05, "loss": 0.0008, "num_tokens": 2578461.0, "reward": 1.75, "reward_std": 0.34503278136253357, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.17251639068126678, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 476.375, "completions/mean_terminated_length": 476.375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.05792289245526656, "frac_reward_zero_std": 1.0, "grad_norm": 0.056640625, "kl": 0.010532338928896934, "learning_rate": 1.1528545119705342e-05, "loss": 0.0004, "num_tokens": 2588888.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.058107360265633644, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.022850419278256595, "learning_rate": 1.1565377532228361e-05, "loss": 0.0009, "num_tokens": 2593720.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 405.25, "completions/mean_terminated_length": 405.25, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.05829182807600074, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.011339049378875643, "learning_rate": 1.1602209944751381e-05, "loss": 0.0005, "num_tokens": 2601666.0, "reward": 1.0, "reward_std": 0.7183768153190613, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.39865073561668396, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 244.375, "completions/mean_terminated_length": 244.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.05847629588636783, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.019974448368884623, "learning_rate": 1.1639042357274402e-05, "loss": 0.0008, "num_tokens": 2608149.0, "reward": 1.375, "reward_std": 0.36403128504753113, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.3640313446521759, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 147.125, "completions/mean_terminated_length": 147.125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.05866076369673492, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.029570457292720675, "learning_rate": 1.1675874769797422e-05, "loss": 0.0012, "num_tokens": 2612350.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 111.875, "completions/mean_terminated_length": 111.875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.05884523150710201, "frac_reward_zero_std": 0.0, "grad_norm": 3.328125, "kl": 0.015173492138274014, "learning_rate": 1.1712707182320442e-05, "loss": 0.0006, "num_tokens": 2618021.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 157.375, "completions/mean_terminated_length": 157.375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.059029699317469105, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.012732222850900143, "learning_rate": 1.1749539594843465e-05, "loss": 0.0005, "num_tokens": 2625392.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 473.875, "completions/mean_terminated_length": 473.875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.05921416712783619, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.0056219008401967585, "learning_rate": 1.1786372007366484e-05, "loss": 0.0002, "num_tokens": 2637903.0, "reward": 1.4666666984558105, "reward_std": 0.33046382665634155, "rewards/fixed_code_pass_all_test_reward/mean": 0.46666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.33046385645866394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 321.0, "completions/mean_terminated_length": 321.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.059398634938203285, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.009614054462872446, "learning_rate": 1.1823204419889504e-05, "loss": 0.0004, "num_tokens": 2646791.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 103.375, "completions/mean_terminated_length": 103.375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.05958310274857037, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "kl": 0.06595568917691708, "learning_rate": 1.1860036832412525e-05, "loss": 0.0026, "num_tokens": 2650530.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.059767570558937465, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.013308381894603372, "learning_rate": 1.1896869244935545e-05, "loss": 0.0005, "num_tokens": 2659329.0, "reward": 1.1428570747375488, "reward_std": 0.3499270975589752, "rewards/fixed_code_pass_all_test_reward/mean": 0.1428571492433548, "rewards/fixed_code_pass_all_test_reward/std": 0.3499271273612976, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 510.5, "completions/mean_terminated_length": 510.5, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.05995203836930456, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.00817285745870322, "learning_rate": 1.1933701657458564e-05, "loss": 0.0003, "num_tokens": 2672533.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 126.5, "completions/mean_terminated_length": 126.5, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.060136506179671645, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.03314072312787175, "learning_rate": 1.1970534069981586e-05, "loss": 0.0013, "num_tokens": 2676417.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.06032097399003874, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.015122409036848694, "learning_rate": 1.2007366482504605e-05, "loss": 0.0006, "num_tokens": 2682664.0, "reward": 1.6931817531585693, "reward_std": 0.3109094500541687, "rewards/fixed_code_pass_all_test_reward/mean": 0.6931818723678589, "rewards/fixed_code_pass_all_test_reward/std": 0.3109094798564911, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.06050544180040583, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.012738097953842953, "learning_rate": 1.2044198895027625e-05, "loss": 0.0005, "num_tokens": 2690748.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.06068990961077292, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.022641177638433874, "learning_rate": 1.2081031307550646e-05, "loss": 0.0009, "num_tokens": 2701122.0, "reward": 1.3181818723678589, "reward_std": 0.42916131019592285, "rewards/fixed_code_pass_all_test_reward/mean": 0.3181818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.42916133999824524, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.06087437742114001, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.016380450048018247, "learning_rate": 1.2117863720073666e-05, "loss": 0.0007, "num_tokens": 2710526.0, "reward": 1.8204545974731445, "reward_std": 0.1227153092622757, "rewards/fixed_code_pass_all_test_reward/mean": 0.8204545378684998, "rewards/fixed_code_pass_all_test_reward/std": 0.12271526455879211, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.0610588452315071, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.015487940516322851, "learning_rate": 1.2154696132596685e-05, "loss": 0.0006, "num_tokens": 2716125.0, "reward": 1.7999999523162842, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.7999999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.2828426957130432, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 394.875, "completions/mean_terminated_length": 394.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.06124331304187419, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.012470854504499584, "learning_rate": 1.2191528545119707e-05, "loss": 0.0005, "num_tokens": 2726564.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 230.375, "completions/mean_terminated_length": 230.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.061427780852241286, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.01921490696258843, "learning_rate": 1.2228360957642726e-05, "loss": 0.0008, "num_tokens": 2732639.0, "reward": 1.125, "reward_std": 0.3184136748313904, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.318413645029068, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 728.75, "completions/mean_terminated_length": 728.75, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.06161224866260837, "frac_reward_zero_std": 0.0, "grad_norm": 0.53515625, "kl": 0.007766801572870463, "learning_rate": 1.2265193370165746e-05, "loss": 0.0003, "num_tokens": 2755997.0, "reward": 1.04347825050354, "reward_std": 0.31438636779785156, "rewards/fixed_code_pass_all_test_reward/mean": 0.16847826540470123, "rewards/fixed_code_pass_all_test_reward/std": 0.232522115111351, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.061796716472975466, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.02139846934005618, "learning_rate": 1.2302025782688767e-05, "loss": 0.0009, "num_tokens": 2761508.0, "reward": 1.377500057220459, "reward_std": 0.5155233144760132, "rewards/fixed_code_pass_all_test_reward/mean": 0.3774999976158142, "rewards/fixed_code_pass_all_test_reward/std": 0.5155233144760132, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 405.5, "completions/mean_terminated_length": 405.5, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.06198118428334256, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.020631293824408203, "learning_rate": 1.2338858195211787e-05, "loss": 0.0008, "num_tokens": 2774664.0, "reward": 1.5113636255264282, "reward_std": 0.5232211351394653, "rewards/fixed_code_pass_all_test_reward/mean": 0.5113636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.5232211351394653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 105.875, "completions/mean_terminated_length": 105.875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.062165652093709646, "frac_reward_zero_std": 1.0, "grad_norm": 0.291015625, "kl": 0.04452067380771041, "learning_rate": 1.2375690607734806e-05, "loss": 0.0018, "num_tokens": 2778231.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 338.125, "completions/mean_terminated_length": 338.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.06235011990407674, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.011730821861419827, "learning_rate": 1.2412523020257828e-05, "loss": 0.0005, "num_tokens": 2788112.0, "reward": 1.125, "reward_std": 0.4527692496776581, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106813788414, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 237.875, "completions/mean_terminated_length": 237.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.06253458771444383, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.02893826726358384, "learning_rate": 1.2449355432780847e-05, "loss": 0.0012, "num_tokens": 2797247.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 316.25, "completions/mean_terminated_length": 316.25, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.06271905552481093, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.01831023022532463, "learning_rate": 1.2486187845303867e-05, "loss": 0.0007, "num_tokens": 2804729.0, "reward": 1.5859375, "reward_std": 0.3078900873661041, "rewards/fixed_code_pass_all_test_reward/mean": 0.5859375, "rewards/fixed_code_pass_all_test_reward/std": 0.3078901171684265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 380.125, "completions/mean_terminated_length": 380.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.06290352333517801, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.014041322923731059, "learning_rate": 1.2523020257826888e-05, "loss": 0.0006, "num_tokens": 2817474.0, "reward": 1.0775861740112305, "reward_std": 0.5248213410377502, "rewards/fixed_code_pass_all_test_reward/mean": 0.20258620381355286, "rewards/fixed_code_pass_all_test_reward/std": 0.3042295575141907, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 501.375, "completions/mean_terminated_length": 501.375, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.0630879911455451, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.013173687737435102, "learning_rate": 1.2559852670349908e-05, "loss": 0.0005, "num_tokens": 2831965.0, "reward": 0.637499988079071, "reward_std": 0.5289815068244934, "rewards/fixed_code_pass_all_test_reward/mean": 0.012500000186264515, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.06327245895591219, "frac_reward_zero_std": 1.0, "grad_norm": 0.287109375, "kl": 0.027352083940058947, "learning_rate": 1.2596685082872928e-05, "loss": 0.0011, "num_tokens": 2836666.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 441.5, "completions/mean_terminated_length": 441.5, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.06345692676627929, "frac_reward_zero_std": 1.0, "grad_norm": 0.0546875, "kl": 0.010934065794572234, "learning_rate": 1.263351749539595e-05, "loss": 0.0004, "num_tokens": 2846118.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.06364139457664637, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.02658699546009302, "learning_rate": 1.267034990791897e-05, "loss": 0.0011, "num_tokens": 2852395.0, "reward": 1.6590909957885742, "reward_std": 0.23402664065361023, "rewards/fixed_code_pass_all_test_reward/mean": 0.6590909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.23402665555477142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 326.375, "completions/mean_terminated_length": 326.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.06382586238701346, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.02411665755789727, "learning_rate": 1.2707182320441991e-05, "loss": 0.001, "num_tokens": 2860734.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 275.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.06401033019738056, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.021102301427163184, "learning_rate": 1.2744014732965011e-05, "loss": 0.0008, "num_tokens": 2868508.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 394.75, "completions/mean_terminated_length": 394.75, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.06419479800774765, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.012544538651127368, "learning_rate": 1.278084714548803e-05, "loss": 0.0005, "num_tokens": 2877490.0, "reward": 1.6979166269302368, "reward_std": 0.34628844261169434, "rewards/fixed_code_pass_all_test_reward/mean": 0.6979166269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.34628844261169434, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 571.0, "completions/mean_terminated_length": 571.0, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.06437926581811473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.017382360063493252, "learning_rate": 1.2817679558011052e-05, "loss": 0.0007, "num_tokens": 2892002.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 327.75, "completions/mean_terminated_length": 327.75, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.06456373362848183, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.018179696518927813, "learning_rate": 1.2854511970534072e-05, "loss": 0.0007, "num_tokens": 2899560.0, "reward": 1.0972223281860352, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.0972222238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.06474820143884892, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.02552419825224206, "learning_rate": 1.2891344383057091e-05, "loss": 0.001, "num_tokens": 2904002.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 344.375, "completions/mean_terminated_length": 344.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.06493266924921601, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.02515380212571472, "learning_rate": 1.2928176795580112e-05, "loss": 0.001, "num_tokens": 2912381.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 131.75, "completions/mean_terminated_length": 131.75, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.06511713705958311, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.04765529604628682, "learning_rate": 1.2965009208103132e-05, "loss": 0.0019, "num_tokens": 2917115.0, "reward": 1.7784090042114258, "reward_std": 0.31250739097595215, "rewards/fixed_code_pass_all_test_reward/mean": 0.7784091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.31250739097595215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 157.75, "completions/mean_terminated_length": 157.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.0653016048699502, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.02578783524222672, "learning_rate": 1.3001841620626152e-05, "loss": 0.001, "num_tokens": 2921905.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.06548607268031728, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.020696112071163952, "learning_rate": 1.3038674033149173e-05, "loss": 0.0008, "num_tokens": 2926172.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 494.625, "completions/mean_terminated_length": 494.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.06567054049068438, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.02559791225939989, "learning_rate": 1.3075506445672193e-05, "loss": 0.001, "num_tokens": 2938089.0, "reward": 1.2374999523162842, "reward_std": 0.5069164633750916, "rewards/fixed_code_pass_all_test_reward/mean": 0.36249998211860657, "rewards/fixed_code_pass_all_test_reward/std": 0.33779749274253845, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 349.0, "completions/mean_terminated_length": 349.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.06585500830105147, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.023035830236040056, "learning_rate": 1.3112338858195212e-05, "loss": 0.0009, "num_tokens": 2947465.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 299.625, "completions/mean_terminated_length": 299.625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.06603947611141855, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.020817721378989518, "learning_rate": 1.3149171270718234e-05, "loss": 0.0008, "num_tokens": 2955038.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.06622394392178566, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.04343878570944071, "learning_rate": 1.3186003683241253e-05, "loss": 0.0017, "num_tokens": 2963945.0, "reward": 1.985714316368103, "reward_std": 0.040406085550785065, "rewards/fixed_code_pass_all_test_reward/mean": 0.985714316368103, "rewards/fixed_code_pass_all_test_reward/std": 0.04040610045194626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.06640841173215274, "frac_reward_zero_std": 1.0, "grad_norm": 0.193359375, "kl": 0.03615026455372572, "learning_rate": 1.3222836095764273e-05, "loss": 0.0014, "num_tokens": 2971741.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 524.625, "completions/mean_terminated_length": 524.625, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.06659287954251983, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.010908561176620424, "learning_rate": 1.3259668508287294e-05, "loss": 0.0004, "num_tokens": 2988378.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 157.125, "completions/mean_terminated_length": 157.125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.06677734735288691, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.06569674680940807, "learning_rate": 1.3296500920810314e-05, "loss": 0.0026, "num_tokens": 2992555.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 153.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.06696181516325402, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.041602545883506536, "learning_rate": 1.3333333333333333e-05, "loss": 0.0017, "num_tokens": 2999999.0, "reward": 1.9583332538604736, "reward_std": 0.016835885122418404, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.016835875809192657, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 375.875, "completions/mean_terminated_length": 375.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.0671462829736211, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.01394475408596918, "learning_rate": 1.3370165745856355e-05, "loss": 0.0006, "num_tokens": 3007974.0, "reward": 1.84375, "reward_std": 0.35197147727012634, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 137.375, "completions/mean_terminated_length": 137.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.06733075078398819, "frac_reward_zero_std": 0.0, "grad_norm": 3.34375, "kl": 0.04764822777360678, "learning_rate": 1.3406998158379374e-05, "loss": 0.0019, "num_tokens": 3015833.0, "reward": 1.7866380214691162, "reward_std": 0.29446762800216675, "rewards/fixed_code_pass_all_test_reward/mean": 0.7866379022598267, "rewards/fixed_code_pass_all_test_reward/std": 0.29446765780448914, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.06751521859435529, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.04429113166406751, "learning_rate": 1.3443830570902394e-05, "loss": 0.0018, "num_tokens": 3021396.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 184.25, "completions/mean_terminated_length": 184.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.06769968640472238, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.07754581165499985, "learning_rate": 1.3480662983425417e-05, "loss": 0.0031, "num_tokens": 3029470.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 301.125, "completions/mean_terminated_length": 301.125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.06788415421508946, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "kl": 0.07375993009191006, "learning_rate": 1.3517495395948436e-05, "loss": 0.003, "num_tokens": 3036751.0, "reward": 1.3515625, "reward_std": 0.7038934230804443, "rewards/fixed_code_pass_all_test_reward/mean": 0.6015625, "rewards/fixed_code_pass_all_test_reward/std": 0.31328028440475464, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 315.625, "completions/mean_terminated_length": 315.625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.06806862202545656, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.027552447514608502, "learning_rate": 1.3554327808471458e-05, "loss": 0.0011, "num_tokens": 3049068.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 106.125, "completions/mean_terminated_length": 106.125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.06825308983582365, "frac_reward_zero_std": 0.0, "grad_norm": 3.640625, "kl": 0.05798104964196682, "learning_rate": 1.3591160220994477e-05, "loss": 0.0023, "num_tokens": 3052797.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 108.875, "completions/mean_terminated_length": 108.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.06843755764619074, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "kl": 0.04925201553851366, "learning_rate": 1.3627992633517497e-05, "loss": 0.002, "num_tokens": 3056548.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 246.75, "completions/mean_terminated_length": 246.75, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.06862202545655784, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.02224466612096876, "learning_rate": 1.3664825046040517e-05, "loss": 0.0009, "num_tokens": 3062290.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.06880649326692492, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.06345775816589594, "learning_rate": 1.3701657458563538e-05, "loss": 0.0025, "num_tokens": 3066456.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 204.625, "completions/mean_terminated_length": 204.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.06899096107729201, "frac_reward_zero_std": 1.0, "grad_norm": 0.265625, "kl": 0.0529457307420671, "learning_rate": 1.3738489871086557e-05, "loss": 0.0021, "num_tokens": 3074725.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.06917542888765911, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.04789333953522146, "learning_rate": 1.3775322283609577e-05, "loss": 0.0019, "num_tokens": 3079272.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.0693598966980262, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.04456760617904365, "learning_rate": 1.3812154696132598e-05, "loss": 0.0018, "num_tokens": 3086833.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 339.375, "completions/mean_terminated_length": 339.375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.06954436450839328, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.03201671864371747, "learning_rate": 1.3848987108655618e-05, "loss": 0.0013, "num_tokens": 3098020.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 292.75, "completions/mean_terminated_length": 292.75, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.06972883231876037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.017324706888757646, "learning_rate": 1.3885819521178638e-05, "loss": 0.0007, "num_tokens": 3105226.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 186.625, "completions/mean_terminated_length": 186.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.06991330012912747, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.029251458239741623, "learning_rate": 1.3922651933701659e-05, "loss": 0.0012, "num_tokens": 3111007.0, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.41052016615867615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 82.875, "completions/mean_terminated_length": 82.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.07009776793949456, "frac_reward_zero_std": 1.0, "grad_norm": 0.546875, "kl": 0.11373530700802803, "learning_rate": 1.3959484346224679e-05, "loss": 0.0045, "num_tokens": 3114486.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.07028223574986164, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.026900959899649024, "learning_rate": 1.3996316758747698e-05, "loss": 0.0011, "num_tokens": 3121550.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 257.0, "completions/mean_terminated_length": 257.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.07046670356022874, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.022108297096565366, "learning_rate": 1.403314917127072e-05, "loss": 0.0009, "num_tokens": 3133462.0, "reward": 1.25, "reward_std": 0.3505098223686218, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.3505098521709442, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.07065117137059583, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.01975124899763614, "learning_rate": 1.4069981583793739e-05, "loss": 0.0008, "num_tokens": 3142124.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 213.75, "completions/mean_terminated_length": 213.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.07083563918096292, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.025328515330329537, "learning_rate": 1.4106813996316759e-05, "loss": 0.001, "num_tokens": 3151010.0, "reward": 1.7321428060531616, "reward_std": 0.3708029091358185, "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428060531616, "rewards/fixed_code_pass_all_test_reward/std": 0.3708029091358185, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 81.375, "completions/mean_terminated_length": 81.375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.07102010699133002, "frac_reward_zero_std": 0.0, "grad_norm": 3.984375, "kl": 0.06819826737046242, "learning_rate": 1.414364640883978e-05, "loss": 0.0027, "num_tokens": 3154485.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 305.25, "completions/mean_terminated_length": 305.25, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.0712045748016971, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.018316328176297247, "learning_rate": 1.41804788213628e-05, "loss": 0.0007, "num_tokens": 3167591.0, "reward": 1.5120967626571655, "reward_std": 0.5225514769554138, "rewards/fixed_code_pass_all_test_reward/mean": 0.5120967626571655, "rewards/fixed_code_pass_all_test_reward/std": 0.5225514769554138, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 332.375, "completions/mean_terminated_length": 332.375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.07138904261206419, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.014335550600662827, "learning_rate": 1.421731123388582e-05, "loss": 0.0006, "num_tokens": 3175594.0, "reward": 1.0999999046325684, "reward_std": 0.06172135844826698, "rewards/fixed_code_pass_all_test_reward/mean": 0.10000000894069672, "rewards/fixed_code_pass_all_test_reward/std": 0.06172134354710579, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.07157351042243129, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.03184652351774275, "learning_rate": 1.425414364640884e-05, "loss": 0.0013, "num_tokens": 3184228.0, "reward": 1.1375000476837158, "reward_std": 0.5328680276870728, "rewards/fixed_code_pass_all_test_reward/mean": 0.26249998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.3354397714138031, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 423.0, "completions/mean_terminated_length": 423.0, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.07175797823279838, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.014729015587363392, "learning_rate": 1.429097605893186e-05, "loss": 0.0006, "num_tokens": 3198060.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 159.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.07194244604316546, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.03247330989688635, "learning_rate": 1.432780847145488e-05, "loss": 0.0013, "num_tokens": 3205714.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 249.25, "completions/mean_terminated_length": 249.25, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.07212691385353256, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.02143077866639942, "learning_rate": 1.4364640883977903e-05, "loss": 0.0009, "num_tokens": 3214884.0, "reward": 1.5714285373687744, "reward_std": 0.7125760316848755, "rewards/fixed_code_pass_all_test_reward/mean": 0.6964285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.42870157957077026, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 443.0, "completions/mean_terminated_length": 443.0, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.07231138166389965, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.025860476307570934, "learning_rate": 1.4401473296500922e-05, "loss": 0.001, "num_tokens": 3223676.0, "reward": 0.03846153989434242, "reward_std": 0.10878566652536392, "rewards/fixed_code_pass_all_test_reward/mean": 0.03846153989434242, "rewards/fixed_code_pass_all_test_reward/std": 0.10878566652536392, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.07249584947426674, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.012229728512465954, "learning_rate": 1.4438305709023944e-05, "loss": 0.0005, "num_tokens": 3233675.0, "reward": 1.3055555820465088, "reward_std": 0.4313548505306244, "rewards/fixed_code_pass_all_test_reward/mean": 0.3055555522441864, "rewards/fixed_code_pass_all_test_reward/std": 0.4313548505306244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 201.75, "completions/mean_terminated_length": 201.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.07268031728463382, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.01968642318388447, "learning_rate": 1.4475138121546963e-05, "loss": 0.0008, "num_tokens": 3242889.0, "reward": 1.0416666269302368, "reward_std": 0.06173158437013626, "rewards/fixed_code_pass_all_test_reward/mean": 0.0416666679084301, "rewards/fixed_code_pass_all_test_reward/std": 0.061731547117233276, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.07286478509500093, "frac_reward_zero_std": 0.0, "grad_norm": 2.953125, "kl": 0.028383291326463223, "learning_rate": 1.4511970534069983e-05, "loss": 0.0011, "num_tokens": 3249167.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.07304925290536801, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.025469497311860323, "learning_rate": 1.4548802946593004e-05, "loss": 0.001, "num_tokens": 3256434.0, "reward": 1.433333396911621, "reward_std": 0.4062996208667755, "rewards/fixed_code_pass_all_test_reward/mean": 0.5583333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.4077620208263397, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 323.5, "completions/mean_terminated_length": 323.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.0732337207157351, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.019956157950218767, "learning_rate": 1.4585635359116024e-05, "loss": 0.0008, "num_tokens": 3265942.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.0734181885261022, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.035665380069985986, "learning_rate": 1.4622467771639043e-05, "loss": 0.0014, "num_tokens": 3270883.0, "reward": 1.1322674751281738, "reward_std": 0.654917299747467, "rewards/fixed_code_pass_all_test_reward/mean": 0.38226747512817383, "rewards/fixed_code_pass_all_test_reward/std": 0.39453884959220886, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 360.0, "completions/mean_terminated_length": 360.0, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.07360265633646929, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.01833764836192131, "learning_rate": 1.4659300184162065e-05, "loss": 0.0007, "num_tokens": 3280939.0, "reward": 1.3522727489471436, "reward_std": 0.17137201130390167, "rewards/fixed_code_pass_all_test_reward/mean": 0.35227274894714355, "rewards/fixed_code_pass_all_test_reward/std": 0.17137199640274048, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 231.375, "completions/mean_terminated_length": 231.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.07378712414683637, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.02949512260966003, "learning_rate": 1.4696132596685084e-05, "loss": 0.0012, "num_tokens": 3286934.0, "reward": 0.6607142686843872, "reward_std": 0.5555838942527771, "rewards/fixed_code_pass_all_test_reward/mean": 0.0357142873108387, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.07397159195720347, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.0227965428493917, "learning_rate": 1.4732965009208104e-05, "loss": 0.0009, "num_tokens": 3292792.0, "reward": 1.375, "reward_std": 0.21967828273773193, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.21967832744121552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.07415605976757056, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.016063445480540395, "learning_rate": 1.4769797421731125e-05, "loss": 0.0006, "num_tokens": 3299058.0, "reward": 1.185185194015503, "reward_std": 0.1533479392528534, "rewards/fixed_code_pass_all_test_reward/mean": 0.18518519401550293, "rewards/fixed_code_pass_all_test_reward/std": 0.153347909450531, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.07434052757793765, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.0556350271217525, "learning_rate": 1.4806629834254145e-05, "loss": 0.0022, "num_tokens": 3303093.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.07452499538830475, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.03765711607411504, "learning_rate": 1.4843462246777164e-05, "loss": 0.0015, "num_tokens": 3308842.0, "reward": 1.7222222089767456, "reward_std": 0.2519763112068176, "rewards/fixed_code_pass_all_test_reward/mean": 0.7222222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.2519763112068176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.07470946319867183, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.037675265688449144, "learning_rate": 1.4880294659300186e-05, "loss": 0.0015, "num_tokens": 3314056.0, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 86.125, "completions/mean_terminated_length": 86.125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.07489393100903892, "frac_reward_zero_std": 1.0, "grad_norm": 0.31640625, "kl": 0.06568852253258228, "learning_rate": 1.4917127071823205e-05, "loss": 0.0026, "num_tokens": 3317481.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 304.0, "completions/mean_terminated_length": 304.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.07507839881940602, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.029842931078746915, "learning_rate": 1.4953959484346225e-05, "loss": 0.0012, "num_tokens": 3324513.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 155.875, "completions/mean_terminated_length": 155.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.0752628666297731, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.037822967395186424, "learning_rate": 1.4990791896869246e-05, "loss": 0.0015, "num_tokens": 3332800.0, "reward": 1.1666666269302368, "reward_std": 0.01484791748225689, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.014847845770418644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.07544733444014019, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.022158857202157378, "learning_rate": 1.5027624309392266e-05, "loss": 0.0009, "num_tokens": 3338779.0, "reward": 0.5723683834075928, "reward_std": 0.6405611038208008, "rewards/fixed_code_pass_all_test_reward/mean": 0.07236842066049576, "rewards/fixed_code_pass_all_test_reward/std": 0.20468878746032715, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.07563180225050728, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.04054498765617609, "learning_rate": 1.5064456721915286e-05, "loss": 0.0016, "num_tokens": 3345437.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 394.625, "completions/mean_terminated_length": 394.625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.07581627006087438, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.020458632614463568, "learning_rate": 1.5101289134438307e-05, "loss": 0.0008, "num_tokens": 3356442.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 283.25, "completions/mean_terminated_length": 283.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.07600073787124147, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.03337156749330461, "learning_rate": 1.5138121546961326e-05, "loss": 0.0013, "num_tokens": 3367060.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.07618520568160855, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.03484258451499045, "learning_rate": 1.5174953959484346e-05, "loss": 0.0014, "num_tokens": 3372058.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.07636967349197565, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.037658774526789784, "learning_rate": 1.5211786372007367e-05, "loss": 0.0015, "num_tokens": 3380342.0, "reward": 1.46875, "reward_std": 0.4677801728248596, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.4677802324295044, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 202.375, "completions/mean_terminated_length": 202.375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.07655414130234274, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.03212064364925027, "learning_rate": 1.5248618784530389e-05, "loss": 0.0013, "num_tokens": 3386401.0, "reward": 1.0054347515106201, "reward_std": 0.015371870249509811, "rewards/fixed_code_pass_all_test_reward/mean": 0.005434782709926367, "rewards/fixed_code_pass_all_test_reward/std": 0.015371887944638729, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 118.75, "completions/mean_terminated_length": 118.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.07673860911270983, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.05387245863676071, "learning_rate": 1.528545119705341e-05, "loss": 0.0022, "num_tokens": 3390279.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 225.125, "completions/mean_terminated_length": 225.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.07692307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.03960717679001391, "learning_rate": 1.532228360957643e-05, "loss": 0.0016, "num_tokens": 3399272.0, "reward": 1.8712120056152344, "reward_std": 0.32358160614967346, "rewards/fixed_code_pass_all_test_reward/mean": 0.8712121248245239, "rewards/fixed_code_pass_all_test_reward/std": 0.32358160614967346, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.07710754473344401, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.05044435535091907, "learning_rate": 1.535911602209945e-05, "loss": 0.002, "num_tokens": 3407230.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 611.5, "completions/mean_terminated_length": 611.5, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.0772920125438111, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.013821795582771301, "learning_rate": 1.539594843462247e-05, "loss": 0.0006, "num_tokens": 3423586.0, "reward": 1.46875, "reward_std": 0.6605936288833618, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.3764851689338684, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 181.625, "completions/mean_terminated_length": 181.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.0774764803541782, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.04522925755009055, "learning_rate": 1.543278084714549e-05, "loss": 0.0018, "num_tokens": 3431975.0, "reward": 1.018617033958435, "reward_std": 0.05265690013766289, "rewards/fixed_code_pass_all_test_reward/mean": 0.018617020919919014, "rewards/fixed_code_pass_all_test_reward/std": 0.05265688896179199, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 464.125, "completions/mean_terminated_length": 464.125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.07766094816454529, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.02720178826712072, "learning_rate": 1.546961325966851e-05, "loss": 0.0011, "num_tokens": 3440640.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 146.375, "completions/mean_terminated_length": 146.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.07784541597491237, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.04507256951183081, "learning_rate": 1.550644567219153e-05, "loss": 0.0018, "num_tokens": 3444835.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 255.25, "completions/mean_terminated_length": 255.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.07802988378527947, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.042787248035892844, "learning_rate": 1.554327808471455e-05, "loss": 0.0017, "num_tokens": 3457061.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 340.125, "completions/mean_terminated_length": 340.125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.07821435159564656, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.020434472477063537, "learning_rate": 1.558011049723757e-05, "loss": 0.0008, "num_tokens": 3466542.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 156.375, "completions/mean_terminated_length": 156.375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.07839881940601365, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.03905394906178117, "learning_rate": 1.561694290976059e-05, "loss": 0.0016, "num_tokens": 3470681.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.07858328721638075, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.029699954204261303, "learning_rate": 1.565377532228361e-05, "loss": 0.0012, "num_tokens": 3475088.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 163.375, "completions/mean_terminated_length": 163.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.07876775502674783, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0475653032772243, "learning_rate": 1.5690607734806633e-05, "loss": 0.0019, "num_tokens": 3483323.0, "reward": 1.7282609939575195, "reward_std": 0.030743766576051712, "rewards/fixed_code_pass_all_test_reward/mean": 0.72826087474823, "rewards/fixed_code_pass_all_test_reward/std": 0.030743766576051712, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 247.125, "completions/mean_terminated_length": 247.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.07895222283711492, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.0530063568148762, "learning_rate": 1.5727440147329652e-05, "loss": 0.0021, "num_tokens": 3489236.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 72.25, "completions/mean_terminated_length": 72.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.07913669064748201, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.09762355033308268, "learning_rate": 1.5764272559852672e-05, "loss": 0.0039, "num_tokens": 3492598.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 144.5, "completions/mean_terminated_length": 144.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.07932115845784911, "frac_reward_zero_std": 0.0, "grad_norm": 3.34375, "kl": 0.052001309813931584, "learning_rate": 1.580110497237569e-05, "loss": 0.0021, "num_tokens": 3496690.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.0795056262682162, "frac_reward_zero_std": 0.0, "grad_norm": 3.84375, "kl": 0.0688655492849648, "learning_rate": 1.583793738489871e-05, "loss": 0.0028, "num_tokens": 3500553.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 199.125, "completions/mean_terminated_length": 199.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.07969009407858328, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.028024996630847454, "learning_rate": 1.587476979742173e-05, "loss": 0.0011, "num_tokens": 3507778.0, "reward": 1.6959459781646729, "reward_std": 0.2517806589603424, "rewards/fixed_code_pass_all_test_reward/mean": 0.6959459781646729, "rewards/fixed_code_pass_all_test_reward/std": 0.2517806887626648, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 584.75, "completions/mean_terminated_length": 584.75, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.07987456188895038, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.015700081712566316, "learning_rate": 1.5911602209944754e-05, "loss": 0.0006, "num_tokens": 3519592.0, "reward": 1.2249999046325684, "reward_std": 0.5897941589355469, "rewards/fixed_code_pass_all_test_reward/mean": 0.3499999940395355, "rewards/fixed_code_pass_all_test_reward/std": 0.4011887013912201, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 325.75, "completions/mean_terminated_length": 325.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.08005902969931747, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.041107492754235864, "learning_rate": 1.5948434622467773e-05, "loss": 0.0016, "num_tokens": 3529854.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 268.875, "completions/mean_terminated_length": 268.875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.08024349750968456, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.029832568718120456, "learning_rate": 1.5985267034990793e-05, "loss": 0.0012, "num_tokens": 3536821.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 218.375, "completions/mean_terminated_length": 218.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.08042796532005166, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.031699523562565446, "learning_rate": 1.6022099447513812e-05, "loss": 0.0013, "num_tokens": 3544504.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 278.125, "completions/mean_terminated_length": 278.125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.08061243313041874, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.04421759117394686, "learning_rate": 1.6058931860036832e-05, "loss": 0.0018, "num_tokens": 3554793.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 181.375, "completions/mean_terminated_length": 181.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.08079690094078583, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.07652451656758785, "learning_rate": 1.6095764272559855e-05, "loss": 0.0031, "num_tokens": 3559300.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 348.875, "completions/mean_terminated_length": 348.875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.08098136875115293, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.027813134714961052, "learning_rate": 1.6132596685082875e-05, "loss": 0.0011, "num_tokens": 3567771.0, "reward": 1.1041666269302368, "reward_std": 0.2946277856826782, "rewards/fixed_code_pass_all_test_reward/mean": 0.1041666641831398, "rewards/fixed_code_pass_all_test_reward/std": 0.2946278154850006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.08116583656152002, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.062437888234853745, "learning_rate": 1.6169429097605894e-05, "loss": 0.0025, "num_tokens": 3573583.0, "reward": 1.4090908765792847, "reward_std": 0.4907647669315338, "rewards/fixed_code_pass_all_test_reward/mean": 0.40909093618392944, "rewards/fixed_code_pass_all_test_reward/std": 0.4907647669315338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.0813503043718871, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125, "kl": 0.07700727181509137, "learning_rate": 1.6206261510128917e-05, "loss": 0.0031, "num_tokens": 3580719.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 219.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.0815347721822542, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.034489301266148686, "learning_rate": 1.6243093922651937e-05, "loss": 0.0014, "num_tokens": 3590008.0, "reward": 1.8156249523162842, "reward_std": 0.07898630201816559, "rewards/fixed_code_pass_all_test_reward/mean": 0.815625011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.07898631691932678, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 94.875, "completions/mean_terminated_length": 94.875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.08171923999262129, "frac_reward_zero_std": 1.0, "grad_norm": 0.2412109375, "kl": 0.08503572596237063, "learning_rate": 1.6279926335174956e-05, "loss": 0.0034, "num_tokens": 3593567.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 288.25, "completions/mean_terminated_length": 288.25, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.08190370780298838, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.05485189985483885, "learning_rate": 1.6316758747697976e-05, "loss": 0.0022, "num_tokens": 3600569.0, "reward": 1.5436046123504639, "reward_std": 0.7216197848320007, "rewards/fixed_code_pass_all_test_reward/mean": 0.7936046123504639, "rewards/fixed_code_pass_all_test_reward/std": 0.3348093032836914, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 280.25, "completions/mean_terminated_length": 280.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.08208817561335546, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.052216792944818735, "learning_rate": 1.6353591160220996e-05, "loss": 0.0021, "num_tokens": 3609915.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 508.125, "completions/mean_terminated_length": 508.125, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.08227264342372256, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.02243325160816312, "learning_rate": 1.6390423572744015e-05, "loss": 0.0009, "num_tokens": 3627356.0, "reward": 1.2083333730697632, "reward_std": 0.8474987149238586, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.4607619643211365, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.08245711123408965, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0778504116460681, "learning_rate": 1.6427255985267035e-05, "loss": 0.0031, "num_tokens": 3635785.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.08264157904445674, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.0684905624948442, "learning_rate": 1.6464088397790058e-05, "loss": 0.0027, "num_tokens": 3645836.0, "reward": 1.665624976158142, "reward_std": 0.7132592797279358, "rewards/fixed_code_pass_all_test_reward/mean": 0.7906249761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.3972983956336975, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 230.625, "completions/mean_terminated_length": 230.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.08282604685482384, "frac_reward_zero_std": 1.0, "grad_norm": 0.05859375, "kl": 0.025190188782289624, "learning_rate": 1.6500920810313078e-05, "loss": 0.001, "num_tokens": 3653505.0, "reward": 1.1304347515106201, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1304347813129425, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.08301051466519092, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.033397057093679905, "learning_rate": 1.6537753222836097e-05, "loss": 0.0013, "num_tokens": 3658772.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.08319498247555801, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.08151811547577381, "learning_rate": 1.6574585635359117e-05, "loss": 0.0033, "num_tokens": 3668949.0, "reward": 1.0254629850387573, "reward_std": 0.9021247029304504, "rewards/fixed_code_pass_all_test_reward/mean": 0.40046295523643494, "rewards/fixed_code_pass_all_test_reward/std": 0.45024189352989197, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.08337945028592511, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.029417765326797962, "learning_rate": 1.6611418047882136e-05, "loss": 0.0012, "num_tokens": 3675893.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 275.625, "completions/mean_terminated_length": 275.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.0835639180962922, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.021990789915435016, "learning_rate": 1.6648250460405156e-05, "loss": 0.0009, "num_tokens": 3681930.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 129.125, "completions/mean_terminated_length": 129.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.08374838590665928, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.07840617559850216, "learning_rate": 1.668508287292818e-05, "loss": 0.0031, "num_tokens": 3685971.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 109.875, "completions/mean_terminated_length": 109.875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.08393285371702638, "frac_reward_zero_std": 1.0, "grad_norm": 0.5546875, "kl": 0.1240438362583518, "learning_rate": 1.67219152854512e-05, "loss": 0.005, "num_tokens": 3692018.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.08411732152739347, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.06656180880963802, "learning_rate": 1.6758747697974218e-05, "loss": 0.0027, "num_tokens": 3695861.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 154.125, "completions/mean_terminated_length": 154.125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.08430178933776056, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.06308302283287048, "learning_rate": 1.6795580110497238e-05, "loss": 0.0025, "num_tokens": 3700102.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 150.375, "completions/mean_terminated_length": 150.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.08448625714812766, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.06178617151454091, "learning_rate": 1.6832412523020257e-05, "loss": 0.0025, "num_tokens": 3708137.0, "reward": 1.778846263885498, "reward_std": 0.23776039481163025, "rewards/fixed_code_pass_all_test_reward/mean": 0.7788461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.23776039481163025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.08467072495849474, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0745656113140285, "learning_rate": 1.6869244935543277e-05, "loss": 0.003, "num_tokens": 3716422.0, "reward": 1.6666667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 240.5, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.08485519276886183, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.0596267965156585, "learning_rate": 1.69060773480663e-05, "loss": 0.0024, "num_tokens": 3723082.0, "reward": 1.1124999523162842, "reward_std": 0.06943647563457489, "rewards/fixed_code_pass_all_test_reward/mean": 0.11250000447034836, "rewards/fixed_code_pass_all_test_reward/std": 0.06943651288747787, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 106.125, "completions/mean_terminated_length": 106.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.08503966057922892, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.07483089808374643, "learning_rate": 1.694290976058932e-05, "loss": 0.003, "num_tokens": 3726763.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.08522412838959602, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.07481989730149508, "learning_rate": 1.6979742173112343e-05, "loss": 0.003, "num_tokens": 3733311.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.0854085961999631, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.029285472352057695, "learning_rate": 1.7016574585635362e-05, "loss": 0.0012, "num_tokens": 3740414.0, "reward": 1.5595238208770752, "reward_std": 0.6387497782707214, "rewards/fixed_code_pass_all_test_reward/mean": 0.6845238208770752, "rewards/fixed_code_pass_all_test_reward/std": 0.2956739068031311, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.08559306401033019, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.07635807478800416, "learning_rate": 1.7053406998158382e-05, "loss": 0.0031, "num_tokens": 3748018.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 78.75, "completions/mean_terminated_length": 78.75, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.08577753182069729, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.08449677424505353, "learning_rate": 1.70902394106814e-05, "loss": 0.0034, "num_tokens": 3753976.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.08596199963106438, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.054731505922973156, "learning_rate": 1.712707182320442e-05, "loss": 0.0022, "num_tokens": 3760323.0, "reward": 1.0520832538604736, "reward_std": 0.14731387794017792, "rewards/fixed_code_pass_all_test_reward/mean": 0.0520833320915699, "rewards/fixed_code_pass_all_test_reward/std": 0.1473139077425003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 93.5, "completions/mean_terminated_length": 93.5, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.08614646744143147, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.15177931962534785, "learning_rate": 1.716390423572744e-05, "loss": 0.0061, "num_tokens": 3768247.0, "reward": 1.2604166269302368, "reward_std": 0.4568428099155426, "rewards/fixed_code_pass_all_test_reward/mean": 0.2604166567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.4568428099155426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 100.25, "completions/mean_terminated_length": 100.25, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.08633093525179857, "frac_reward_zero_std": 0.0, "grad_norm": 3.734375, "kl": 0.0947853890247643, "learning_rate": 1.7200736648250464e-05, "loss": 0.0038, "num_tokens": 3774121.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 104.125, "completions/mean_terminated_length": 104.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.08651540306216565, "frac_reward_zero_std": 1.0, "grad_norm": 0.625, "kl": 0.13161328341811895, "learning_rate": 1.7237569060773483e-05, "loss": 0.0053, "num_tokens": 3782594.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.08669987087253274, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.03714871173724532, "learning_rate": 1.7274401473296503e-05, "loss": 0.0015, "num_tokens": 3791632.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 162.375, "completions/mean_terminated_length": 162.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.08688433868289984, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.08777307160198689, "learning_rate": 1.7311233885819523e-05, "loss": 0.0035, "num_tokens": 3802531.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 555.0, "completions/mean_terminated_length": 555.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.08706880649326693, "frac_reward_zero_std": 1.0, "grad_norm": 0.2275390625, "kl": 0.030425833945628256, "learning_rate": 1.7348066298342542e-05, "loss": 0.0012, "num_tokens": 3816595.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.08725327430363401, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.06503449892625213, "learning_rate": 1.7384898710865562e-05, "loss": 0.0026, "num_tokens": 3825446.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.08743774211400111, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.05034553771838546, "learning_rate": 1.7421731123388585e-05, "loss": 0.002, "num_tokens": 3835043.0, "reward": 0.8806818127632141, "reward_std": 0.3562045097351074, "rewards/fixed_code_pass_all_test_reward/mean": 0.005681818351149559, "rewards/fixed_code_pass_all_test_reward/std": 0.016070609912276268, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 126.625, "completions/mean_terminated_length": 126.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.0876222099243682, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.08125527389347553, "learning_rate": 1.7458563535911604e-05, "loss": 0.0033, "num_tokens": 3842640.0, "reward": 1.0049999952316284, "reward_std": 0.014142122119665146, "rewards/fixed_code_pass_all_test_reward/mean": 0.004999999888241291, "rewards/fixed_code_pass_all_test_reward/std": 0.01414213515818119, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 427.0, "completions/mean_terminated_length": 427.0, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.08780667773473529, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.021837759763002396, "learning_rate": 1.7495395948434624e-05, "loss": 0.0009, "num_tokens": 3856568.0, "reward": 1.6782786846160889, "reward_std": 0.7072255611419678, "rewards/fixed_code_pass_all_test_reward/mean": 0.8032786846160889, "rewards/fixed_code_pass_all_test_reward/std": 0.38165396451950073, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 103.875, "completions/mean_terminated_length": 103.875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.08799114554510237, "frac_reward_zero_std": 0.0, "grad_norm": 4.1875, "kl": 0.09121084213256836, "learning_rate": 1.7532228360957644e-05, "loss": 0.0036, "num_tokens": 3860303.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.08817561335546947, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "kl": 0.06891392637044191, "learning_rate": 1.7569060773480663e-05, "loss": 0.0028, "num_tokens": 3864125.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 204.75, "completions/mean_terminated_length": 204.75, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.08836008116583656, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.058195704594254494, "learning_rate": 1.7605893186003683e-05, "loss": 0.0023, "num_tokens": 3870411.0, "reward": 1.6377551555633545, "reward_std": 0.3077217936515808, "rewards/fixed_code_pass_all_test_reward/mean": 0.6377551555633545, "rewards/fixed_code_pass_all_test_reward/std": 0.3077217936515808, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.08854454897620365, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.04514938546344638, "learning_rate": 1.7642725598526706e-05, "loss": 0.0018, "num_tokens": 3874844.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 92.5, "completions/mean_terminated_length": 92.5, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.08872901678657075, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.08823577640578151, "learning_rate": 1.7679558011049725e-05, "loss": 0.0035, "num_tokens": 3880648.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1206.0, "completions/max_terminated_length": 1206.0, "completions/mean_length": 376.25, "completions/mean_terminated_length": 376.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.08891348459693783, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.05095439846627414, "learning_rate": 1.7716390423572745e-05, "loss": 0.002, "num_tokens": 3890906.0, "reward": 1.3909574747085571, "reward_std": 0.6289011836051941, "rewards/fixed_code_pass_all_test_reward/mean": 0.5159574747085571, "rewards/fixed_code_pass_all_test_reward/std": 0.3508565425872803, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 139.5, "completions/mean_terminated_length": 139.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.08909795240730492, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.12065101834014058, "learning_rate": 1.7753222836095765e-05, "loss": 0.0048, "num_tokens": 3901094.0, "reward": 1.7163461446762085, "reward_std": 0.4227958619594574, "rewards/fixed_code_pass_all_test_reward/mean": 0.7163461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.4227958619594574, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.08928242021767202, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.08889650832861662, "learning_rate": 1.7790055248618784e-05, "loss": 0.0036, "num_tokens": 3906619.0, "reward": 1.875, "reward_std": 0.23953145742416382, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.23953145742416382, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.08946688802803911, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.06771444482728839, "learning_rate": 1.7826887661141807e-05, "loss": 0.0027, "num_tokens": 3913067.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 340.125, "completions/mean_terminated_length": 340.125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.0896513558384062, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.024872958892956376, "learning_rate": 1.7863720073664827e-05, "loss": 0.001, "num_tokens": 3924292.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 146.625, "completions/mean_terminated_length": 146.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.0898358236487733, "frac_reward_zero_std": 0.0, "grad_norm": 3.4375, "kl": 0.09502951940521598, "learning_rate": 1.7900552486187846e-05, "loss": 0.0038, "num_tokens": 3931601.0, "reward": 1.6124999523162842, "reward_std": 0.3632689118385315, "rewards/fixed_code_pass_all_test_reward/mean": 0.6124999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.3632689118385315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.09002029145914038, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.10425100848078728, "learning_rate": 1.793738489871087e-05, "loss": 0.0042, "num_tokens": 3939515.0, "reward": 1.3636364936828613, "reward_std": 0.3977504074573517, "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.39775049686431885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 866.25, "completions/mean_terminated_length": 697.4285888671875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.09020475926950747, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.02450143703026697, "learning_rate": 1.797421731123389e-05, "loss": 0.001, "num_tokens": 3957501.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 320.875, "completions/mean_terminated_length": 320.875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.09038922707987457, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.042886950133834034, "learning_rate": 1.801104972375691e-05, "loss": 0.0017, "num_tokens": 3962964.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.09057369489024165, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.08385624922811985, "learning_rate": 1.804788213627993e-05, "loss": 0.0034, "num_tokens": 3968623.0, "reward": 1.524999976158142, "reward_std": 0.2677063047885895, "rewards/fixed_code_pass_all_test_reward/mean": 0.5249999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2677063047885895, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.09075816270060874, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.12646083580330014, "learning_rate": 1.8084714548802948e-05, "loss": 0.0051, "num_tokens": 3978391.0, "reward": 1.09375, "reward_std": 0.35197147727012634, "rewards/fixed_code_pass_all_test_reward/mean": 0.21875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.09094263051097584, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.053688037442043424, "learning_rate": 1.8121546961325968e-05, "loss": 0.0021, "num_tokens": 3986223.0, "reward": 1.8461538553237915, "reward_std": 0.22462047636508942, "rewards/fixed_code_pass_all_test_reward/mean": 0.8461538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.22462047636508942, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 533.25, "completions/mean_terminated_length": 533.25, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.09112709832134293, "frac_reward_zero_std": 1.0, "grad_norm": 0.041015625, "kl": 0.01679092855192721, "learning_rate": 1.815837937384899e-05, "loss": 0.0007, "num_tokens": 4002313.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.09131156613171001, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.056114659644663334, "learning_rate": 1.819521178637201e-05, "loss": 0.0022, "num_tokens": 4012506.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 587.375, "completions/mean_terminated_length": 587.375, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.0914960339420771, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.025889444164931774, "learning_rate": 1.823204419889503e-05, "loss": 0.001, "num_tokens": 4024421.0, "reward": 1.1323529481887817, "reward_std": 0.46450909972190857, "rewards/fixed_code_pass_all_test_reward/mean": 0.25735294818878174, "rewards/fixed_code_pass_all_test_reward/std": 0.13129831850528717, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.0916805017524442, "frac_reward_zero_std": 0.0, "grad_norm": 4.40625, "kl": 0.059315916150808334, "learning_rate": 1.826887661141805e-05, "loss": 0.0024, "num_tokens": 4032530.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.09186496956281129, "frac_reward_zero_std": 0.0, "grad_norm": 3.640625, "kl": 0.07315297843888402, "learning_rate": 1.830570902394107e-05, "loss": 0.0029, "num_tokens": 4036234.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.09204943737317837, "frac_reward_zero_std": 0.0, "grad_norm": 3.953125, "kl": 0.09299248037859797, "learning_rate": 1.834254143646409e-05, "loss": 0.0037, "num_tokens": 4040730.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 215.625, "completions/mean_terminated_length": 215.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.09223390518354548, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.043008289532735944, "learning_rate": 1.8379373848987108e-05, "loss": 0.0017, "num_tokens": 4047879.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 452.25, "completions/mean_terminated_length": 452.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.09241837299391256, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.029030569130554795, "learning_rate": 1.841620626151013e-05, "loss": 0.0012, "num_tokens": 4054481.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 323.625, "completions/mean_terminated_length": 323.625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.09260284080427965, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.033553194254636765, "learning_rate": 1.845303867403315e-05, "loss": 0.0013, "num_tokens": 4062046.0, "reward": 1.7857142686843872, "reward_std": 0.05090690031647682, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.05090690031647682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 178.875, "completions/mean_terminated_length": 178.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.09278730861464675, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.036709454376250505, "learning_rate": 1.848987108655617e-05, "loss": 0.0015, "num_tokens": 4067573.0, "reward": 1.4612069129943848, "reward_std": 0.21939851343631744, "rewards/fixed_code_pass_all_test_reward/mean": 0.4612068831920624, "rewards/fixed_code_pass_all_test_reward/std": 0.21939854323863983, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.09297177642501384, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0476953920442611, "learning_rate": 1.852670349907919e-05, "loss": 0.0019, "num_tokens": 4074146.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 210.5, "completions/mean_terminated_length": 210.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.09315624423538092, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.048187171341851354, "learning_rate": 1.856353591160221e-05, "loss": 0.0019, "num_tokens": 4082414.0, "reward": 0.9038461446762085, "reward_std": 0.6235719323158264, "rewards/fixed_code_pass_all_test_reward/mean": 0.1538461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.3458484709262848, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.09334071204574802, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "kl": 0.06846186763141304, "learning_rate": 1.860036832412523e-05, "loss": 0.0027, "num_tokens": 4086043.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 295.0, "completions/mean_terminated_length": 295.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.09352517985611511, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.04031226853840053, "learning_rate": 1.8637200736648252e-05, "loss": 0.0016, "num_tokens": 4091651.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.0937096476664822, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.03689823695458472, "learning_rate": 1.8674033149171272e-05, "loss": 0.0015, "num_tokens": 4099707.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 163.42857360839844, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.0938941154768493, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.034453974149073474, "learning_rate": 1.8710865561694295e-05, "loss": 0.0014, "num_tokens": 4105843.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 306.625, "completions/mean_terminated_length": 306.625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.09407858328721638, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.03440332296304405, "learning_rate": 1.8747697974217315e-05, "loss": 0.0014, "num_tokens": 4113384.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.09426305109758347, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.04078325326554477, "learning_rate": 1.8784530386740334e-05, "loss": 0.0016, "num_tokens": 4122661.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 252.375, "completions/mean_terminated_length": 252.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.09444751890795056, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0530955201247707, "learning_rate": 1.8821362799263354e-05, "loss": 0.0021, "num_tokens": 4130584.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.09463198671831766, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.029481176752597094, "learning_rate": 1.8858195211786373e-05, "loss": 0.0012, "num_tokens": 4138903.0, "reward": 1.345588207244873, "reward_std": 0.371284544467926, "rewards/fixed_code_pass_all_test_reward/mean": 0.34558823704719543, "rewards/fixed_code_pass_all_test_reward/std": 0.371284544467926, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 140.875, "completions/mean_terminated_length": 140.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.09481645452868474, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.07821941236034036, "learning_rate": 1.8895027624309393e-05, "loss": 0.0031, "num_tokens": 4143894.0, "reward": 1.9029605388641357, "reward_std": 0.051172200590372086, "rewards/fixed_code_pass_all_test_reward/mean": 0.9029605388641357, "rewards/fixed_code_pass_all_test_reward/std": 0.05117219686508179, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 955.0, "completions/mean_terminated_length": 299.20001220703125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.09500092233905183, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.03327434364473447, "learning_rate": 1.8931860036832416e-05, "loss": 0.0013, "num_tokens": 4154654.0, "reward": 1.0, "reward_std": 1.0690449476242065, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 836.5, "completions/mean_terminated_length": 663.4285888671875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.09518539014941893, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.03199512499850243, "learning_rate": 1.8968692449355436e-05, "loss": 0.0013, "num_tokens": 4164082.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 611.375, "completions/mean_terminated_length": 611.375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.09536985795978602, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.020757006714120507, "learning_rate": 1.9005524861878455e-05, "loss": 0.0008, "num_tokens": 4178197.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 1000.25, "completions/mean_terminated_length": 371.6000061035156, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.0955543257701531, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.03325457152095623, "learning_rate": 1.9042357274401475e-05, "loss": 0.0013, "num_tokens": 4191391.0, "reward": 0.7142857313156128, "reward_std": 0.591484785079956, "rewards/fixed_code_pass_all_test_reward/mean": 0.0892857164144516, "rewards/fixed_code_pass_all_test_reward/std": 0.0739356055855751, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 245.75, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.0957387935805202, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.022311314940452576, "learning_rate": 1.9079189686924494e-05, "loss": 0.0009, "num_tokens": 4200357.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.09592326139088729, "frac_reward_zero_std": 1.0, "grad_norm": 2.4375, "kl": 0.16489154589362442, "learning_rate": 1.9116022099447514e-05, "loss": 0.0066, "num_tokens": 4207691.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 229.0, "completions/mean_terminated_length": 229.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.09610772920125438, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.04166472889482975, "learning_rate": 1.9152854511970537e-05, "loss": 0.0017, "num_tokens": 4217347.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.09629219701162148, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.04268402443267405, "learning_rate": 1.9189686924493557e-05, "loss": 0.0017, "num_tokens": 4226272.0, "reward": 1.62801194190979, "reward_std": 0.3511698842048645, "rewards/fixed_code_pass_all_test_reward/mean": 0.7530120611190796, "rewards/fixed_code_pass_all_test_reward/std": 0.006440025754272938, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 222.875, "completions/mean_terminated_length": 222.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.09647666482198856, "frac_reward_zero_std": 1.0, "grad_norm": 0.2392578125, "kl": 0.05090503324754536, "learning_rate": 1.9226519337016576e-05, "loss": 0.002, "num_tokens": 4236791.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 309.375, "completions/mean_terminated_length": 309.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.09666113263235565, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.02394133829511702, "learning_rate": 1.9263351749539596e-05, "loss": 0.001, "num_tokens": 4242786.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.09684560044272275, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.05782023863866925, "learning_rate": 1.9300184162062615e-05, "loss": 0.0023, "num_tokens": 4250306.0, "reward": 1.6749999523162842, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 433.0, "completions/mean_terminated_length": 433.0, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.09703006825308984, "frac_reward_zero_std": 0.0, "grad_norm": 0.74609375, "kl": 0.02648278814740479, "learning_rate": 1.9337016574585635e-05, "loss": 0.0011, "num_tokens": 4262554.0, "reward": 1.7083332538604736, "reward_std": 0.20138409733772278, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.20138409733772278, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 112.875, "completions/mean_terminated_length": 112.875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.09721453606345692, "frac_reward_zero_std": 1.0, "grad_norm": 0.1513671875, "kl": 0.05078233755193651, "learning_rate": 1.9373848987108658e-05, "loss": 0.002, "num_tokens": 4269833.0, "reward": 1.8333332538604736, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 337.125, "completions/mean_terminated_length": 337.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.09739900387382401, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.04853670235024765, "learning_rate": 1.9410681399631678e-05, "loss": 0.0019, "num_tokens": 4279602.0, "reward": 1.078125, "reward_std": 0.0646936446428299, "rewards/fixed_code_pass_all_test_reward/mean": 0.078125, "rewards/fixed_code_pass_all_test_reward/std": 0.06469365209341049, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 192.25, "completions/mean_terminated_length": 192.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.09758347168419111, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.10749398544430733, "learning_rate": 1.9447513812154697e-05, "loss": 0.0043, "num_tokens": 4286868.0, "reward": 1.5, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 141.625, "completions/mean_terminated_length": 141.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.0977679394945582, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.11041943402960896, "learning_rate": 1.9484346224677717e-05, "loss": 0.0044, "num_tokens": 4296081.0, "reward": 1.6286232471466064, "reward_std": 0.5126267671585083, "rewards/fixed_code_pass_all_test_reward/mean": 0.6286231875419617, "rewards/fixed_code_pass_all_test_reward/std": 0.5126267671585083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 144.5, "completions/mean_terminated_length": 144.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.09795240730492528, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.07248743204399943, "learning_rate": 1.9521178637200737e-05, "loss": 0.0029, "num_tokens": 4304669.0, "reward": 1.225000023841858, "reward_std": 0.328416109085083, "rewards/fixed_code_pass_all_test_reward/mean": 0.22500000894069672, "rewards/fixed_code_pass_all_test_reward/std": 0.328416109085083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.09813687511529239, "frac_reward_zero_std": 1.0, "grad_norm": 0.1865234375, "kl": 0.07099854201078415, "learning_rate": 1.9558011049723756e-05, "loss": 0.0028, "num_tokens": 4312655.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 86.5, "completions/mean_terminated_length": 86.5, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.09832134292565947, "frac_reward_zero_std": 0.0, "grad_norm": 4.625, "kl": 0.1090422379784286, "learning_rate": 1.959484346224678e-05, "loss": 0.0044, "num_tokens": 4316027.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.09850581073602656, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.08461354207247496, "learning_rate": 1.96316758747698e-05, "loss": 0.0034, "num_tokens": 4323307.0, "reward": 1.807692289352417, "reward_std": 0.15924587845802307, "rewards/fixed_code_pass_all_test_reward/mean": 0.807692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.15924589335918427, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 278.125, "completions/mean_terminated_length": 278.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.09869027854639366, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.05451785121113062, "learning_rate": 1.9668508287292822e-05, "loss": 0.0022, "num_tokens": 4330180.0, "reward": 1.0208333730697632, "reward_std": 0.03857587277889252, "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, "rewards/fixed_code_pass_all_test_reward/std": 0.03857583925127983, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 259.375, "completions/mean_terminated_length": 259.375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.09887474635676075, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.03776137938257307, "learning_rate": 1.970534069981584e-05, "loss": 0.0015, "num_tokens": 4338767.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.09905921416712783, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.08948668790981174, "learning_rate": 1.974217311233886e-05, "loss": 0.0036, "num_tokens": 4345966.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.09924368197749493, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.037606473080813885, "learning_rate": 1.977900552486188e-05, "loss": 0.0015, "num_tokens": 4357362.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 172.125, "completions/mean_terminated_length": 172.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.09942814978786202, "frac_reward_zero_std": 0.0, "grad_norm": 6.5, "kl": 0.09733763337135315, "learning_rate": 1.98158379373849e-05, "loss": 0.0039, "num_tokens": 4365859.0, "reward": 0.4910714626312256, "reward_std": 0.6791753768920898, "rewards/fixed_code_pass_all_test_reward/mean": 0.1160714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.16614960134029388, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 472.5, "completions/mean_terminated_length": 247.4285888671875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.0996126175982291, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.0417682021507062, "learning_rate": 1.985267034990792e-05, "loss": 0.0017, "num_tokens": 4372599.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 103.875, "completions/mean_terminated_length": 103.875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.0997970854085962, "frac_reward_zero_std": 1.0, "grad_norm": 0.578125, "kl": 0.08180084056220949, "learning_rate": 1.9889502762430943e-05, "loss": 0.0033, "num_tokens": 4376270.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 198.0, "completions/mean_terminated_length": 198.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.09998155321896329, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.08208787115290761, "learning_rate": 1.9926335174953962e-05, "loss": 0.0033, "num_tokens": 4385638.0, "reward": 1.4166666269302368, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.10016602102933038, "frac_reward_zero_std": 1.0, "grad_norm": 0.224609375, "kl": 0.08298074221238494, "learning_rate": 1.9963167587476982e-05, "loss": 0.0033, "num_tokens": 4394836.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 329.625, "completions/mean_terminated_length": 329.625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.10035048883969747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.035807430278509855, "learning_rate": 2e-05, "loss": 0.0014, "num_tokens": 4402521.0, "reward": 1.1875, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.10053495665006457, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.045798078179359436, "learning_rate": 1.999999792610797e-05, "loss": 0.0018, "num_tokens": 4409502.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 158.875, "completions/mean_terminated_length": 158.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.10071942446043165, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.04011172242462635, "learning_rate": 1.999999170443274e-05, "loss": 0.0016, "num_tokens": 4413829.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 133.75, "completions/mean_terminated_length": 133.75, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.10090389227079874, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.10819826228544116, "learning_rate": 1.999998133497689e-05, "loss": 0.0043, "num_tokens": 4418795.0, "reward": 1.640625, "reward_std": 0.4974825084209442, "rewards/fixed_code_pass_all_test_reward/mean": 0.640625, "rewards/fixed_code_pass_all_test_reward/std": 0.4974825084209442, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.10108836008116584, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.1391353285871446, "learning_rate": 1.9999966817744715e-05, "loss": 0.0056, "num_tokens": 4427218.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 178.0, "completions/mean_terminated_length": 178.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.10127282789153293, "frac_reward_zero_std": 0.0, "grad_norm": 3.328125, "kl": 0.06142499949783087, "learning_rate": 1.9999948152742242e-05, "loss": 0.0025, "num_tokens": 4434746.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 222.125, "completions/mean_terminated_length": 222.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.10145729570190001, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.04623970249667764, "learning_rate": 1.9999925339977215e-05, "loss": 0.0018, "num_tokens": 4439603.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 170.125, "completions/mean_terminated_length": 170.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.10164176351226711, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.1099438089877367, "learning_rate": 1.9999898379459094e-05, "loss": 0.0044, "num_tokens": 4448316.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.1018262313226342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.03206371236592531, "learning_rate": 1.9999867271199066e-05, "loss": 0.0013, "num_tokens": 4452313.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 193.875, "completions/mean_terminated_length": 193.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.10201069913300129, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.03684836090542376, "learning_rate": 1.9999832015210024e-05, "loss": 0.0015, "num_tokens": 4459256.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 168.125, "completions/mean_terminated_length": 168.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.10219516694336839, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.07518244232051075, "learning_rate": 1.9999792611506596e-05, "loss": 0.003, "num_tokens": 4465889.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 162.375, "completions/mean_terminated_length": 162.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.10237963475373547, "frac_reward_zero_std": 1.0, "grad_norm": 0.90625, "kl": 0.06855054013431072, "learning_rate": 1.9999749060105128e-05, "loss": 0.0027, "num_tokens": 4471228.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.10256410256410256, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.07433869037777185, "learning_rate": 1.9999701361023688e-05, "loss": 0.003, "num_tokens": 4480545.0, "reward": 1.0833333730697632, "reward_std": 0.2357023060321808, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022911310196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 92.625, "completions/mean_terminated_length": 92.625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.10274857037446966, "frac_reward_zero_std": 0.0, "grad_norm": 5.15625, "kl": 0.14416034519672394, "learning_rate": 1.9999649514282052e-05, "loss": 0.0058, "num_tokens": 4484158.0, "reward": 0.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 143.875, "completions/mean_terminated_length": 143.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.10293303818483675, "frac_reward_zero_std": 1.0, "grad_norm": 0.1630859375, "kl": 0.11074914317578077, "learning_rate": 1.9999593519901727e-05, "loss": 0.0044, "num_tokens": 4490469.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 122.875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.10311750599520383, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.1158194076269865, "learning_rate": 1.999953337790594e-05, "loss": 0.0046, "num_tokens": 4495124.0, "reward": 1.1583333015441895, "reward_std": 0.3632962107658386, "rewards/fixed_code_pass_all_test_reward/mean": 0.28333336114883423, "rewards/fixed_code_pass_all_test_reward/std": 0.0471404492855072, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.10330197380557093, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.06439644796773791, "learning_rate": 1.9999469088319637e-05, "loss": 0.0026, "num_tokens": 4501641.0, "reward": 0.9196428656578064, "reward_std": 1.0054062604904175, "rewards/fixed_code_pass_all_test_reward/mean": 0.4196428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 223.875, "completions/mean_terminated_length": 223.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.10348644161593802, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.04872225783765316, "learning_rate": 1.9999400651169484e-05, "loss": 0.0019, "num_tokens": 4512096.0, "reward": 1.3416666984558105, "reward_std": 0.17251642048358917, "rewards/fixed_code_pass_all_test_reward/mean": 0.34166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.17251639068126678, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.10367090942630511, "frac_reward_zero_std": 0.0, "grad_norm": 3.390625, "kl": 0.09062996134161949, "learning_rate": 1.9999328066483867e-05, "loss": 0.0036, "num_tokens": 4516074.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 112.5, "completions/mean_terminated_length": 112.5, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.1038553772366722, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.04644568148069084, "learning_rate": 1.999925133429289e-05, "loss": 0.0019, "num_tokens": 4524022.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 736.75, "completions/mean_terminated_length": 736.75, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.1040398450470393, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.018395556253381073, "learning_rate": 1.999917045462838e-05, "loss": 0.0007, "num_tokens": 4540452.0, "reward": 1.3541666269302368, "reward_std": 0.7889992594718933, "rewards/fixed_code_pass_all_test_reward/mean": 0.6041666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.33258846402168274, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 213.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.10422431285740638, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.04401056398637593, "learning_rate": 1.999908542752389e-05, "loss": 0.0018, "num_tokens": 4546142.0, "reward": 1.3229166269302368, "reward_std": 0.5781015157699585, "rewards/fixed_code_pass_all_test_reward/mean": 0.8229166269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.36307087540626526, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 586.5, "completions/mean_terminated_length": 586.5, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.10440878066777347, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.022979808622039855, "learning_rate": 1.9998996253014683e-05, "loss": 0.0009, "num_tokens": 4561394.0, "reward": 1.375, "reward_std": 0.7905694246292114, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.4432026445865631, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 184.25, "completions/mean_terminated_length": 184.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.10459324847814057, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.06566903507336974, "learning_rate": 1.9998902931137747e-05, "loss": 0.0026, "num_tokens": 4566892.0, "reward": 1.9305555820465088, "reward_std": 0.19641853868961334, "rewards/fixed_code_pass_all_test_reward/mean": 0.9305555820465088, "rewards/fixed_code_pass_all_test_reward/std": 0.19641855359077454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.10477771628850766, "frac_reward_zero_std": 0.0, "grad_norm": 3.078125, "kl": 0.07397379027679563, "learning_rate": 1.999880546193179e-05, "loss": 0.003, "num_tokens": 4570929.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 279.125, "completions/mean_terminated_length": 279.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.10496218409887474, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.06306913774460554, "learning_rate": 1.9998703845437236e-05, "loss": 0.0025, "num_tokens": 4577538.0, "reward": 1.6721014976501465, "reward_std": 0.6919734477996826, "rewards/fixed_code_pass_all_test_reward/mean": 0.7971014976501465, "rewards/fixed_code_pass_all_test_reward/std": 0.35508301854133606, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 225.125, "completions/mean_terminated_length": 225.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.10514665190924184, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.05449487408623099, "learning_rate": 1.9998598081696243e-05, "loss": 0.0022, "num_tokens": 4585915.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.10533111971960893, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "kl": 0.07273028511554003, "learning_rate": 1.9998488170752673e-05, "loss": 0.0029, "num_tokens": 4590700.0, "reward": 0.875, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 316.375, "completions/mean_terminated_length": 316.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.10551558752997602, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.07011010241694748, "learning_rate": 1.9998374112652116e-05, "loss": 0.0028, "num_tokens": 4596095.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 283.25, "completions/mean_terminated_length": 283.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.10570005534034312, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.0585760329850018, "learning_rate": 1.9998255907441878e-05, "loss": 0.0023, "num_tokens": 4607649.0, "reward": 1.2750000953674316, "reward_std": 0.3011881113052368, "rewards/fixed_code_pass_all_test_reward/mean": 0.2750000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.3011881709098816, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 139.625, "completions/mean_terminated_length": 139.625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1058845231507102, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.06481848517432809, "learning_rate": 1.9998133555170992e-05, "loss": 0.0026, "num_tokens": 4614286.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.10606899096107729, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.06568572018295527, "learning_rate": 1.99980070558902e-05, "loss": 0.0026, "num_tokens": 4623963.0, "reward": 1.4107142686843872, "reward_std": 0.696112334728241, "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.4544350206851959, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 91.25, "completions/mean_terminated_length": 91.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.10625345877144439, "frac_reward_zero_std": 0.0, "grad_norm": 4.625, "kl": 0.1260861549526453, "learning_rate": 1.9997876409651986e-05, "loss": 0.005, "num_tokens": 4628141.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 550.625, "completions/mean_terminated_length": 550.625, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.10643792658181148, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.027113683696370572, "learning_rate": 1.9997741616510527e-05, "loss": 0.0011, "num_tokens": 4642842.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.10662239439217856, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "kl": 0.08218135265633464, "learning_rate": 1.9997602676521734e-05, "loss": 0.0033, "num_tokens": 4650125.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 311.5, "completions/mean_terminated_length": 311.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.10680686220254565, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.05044455383904278, "learning_rate": 1.9997459589743234e-05, "loss": 0.002, "num_tokens": 4660153.0, "reward": 1.0, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 340.5, "completions/mean_terminated_length": 340.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.10699133001291275, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.08173818700015545, "learning_rate": 1.9997312356234385e-05, "loss": 0.0033, "num_tokens": 4669357.0, "reward": 1.3040540218353271, "reward_std": 0.8384914994239807, "rewards/fixed_code_pass_all_test_reward/mean": 0.5540540814399719, "rewards/fixed_code_pass_all_test_reward/std": 0.4506305754184723, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 307.125, "completions/mean_terminated_length": 307.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.10717579782327984, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.08221329469233751, "learning_rate": 1.999716097605625e-05, "loss": 0.0033, "num_tokens": 4679054.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 263.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.10736026563364692, "frac_reward_zero_std": 1.0, "grad_norm": 0.322265625, "kl": 0.077641187235713, "learning_rate": 1.999700544927162e-05, "loss": 0.0031, "num_tokens": 4685076.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.10754473344401402, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.09640992386266589, "learning_rate": 1.9996845775945e-05, "loss": 0.0039, "num_tokens": 4692120.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 492.0, "completions/mean_terminated_length": 492.0, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.10772920125438111, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.0501486852299422, "learning_rate": 1.9996681956142628e-05, "loss": 0.002, "num_tokens": 4704784.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.1079136690647482, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.09149862779304385, "learning_rate": 1.9996513989932443e-05, "loss": 0.0037, "num_tokens": 4710864.0, "reward": 1.1724998950958252, "reward_std": 0.6182868480682373, "rewards/fixed_code_pass_all_test_reward/mean": 0.29750001430511475, "rewards/fixed_code_pass_all_test_reward/std": 0.4352257251739502, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.1080981368751153, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.06336728483438492, "learning_rate": 1.999634187738412e-05, "loss": 0.0025, "num_tokens": 4719044.0, "reward": 1.1875, "reward_std": 0.33615803718566895, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.33615806698799133, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 288.625, "completions/mean_terminated_length": 288.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.10828260468548238, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.09191461559385061, "learning_rate": 1.9996165618569047e-05, "loss": 0.0037, "num_tokens": 4729273.0, "reward": 1.6199324131011963, "reward_std": 0.4317343235015869, "rewards/fixed_code_pass_all_test_reward/mean": 0.7449324131011963, "rewards/fixed_code_pass_all_test_reward/std": 0.3611232340335846, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.10846707249584947, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.08518279995769262, "learning_rate": 1.999598521356033e-05, "loss": 0.0034, "num_tokens": 4738828.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 391.75, "completions/mean_terminated_length": 391.75, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.10865154030621657, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.06143583403900266, "learning_rate": 1.99958006624328e-05, "loss": 0.0025, "num_tokens": 4747090.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.10883600811658366, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.11042730882763863, "learning_rate": 1.9995611965262998e-05, "loss": 0.0044, "num_tokens": 4751787.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 474.75, "completions/mean_terminated_length": 474.75, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.10902047592695074, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.05499167647212744, "learning_rate": 1.9995419122129202e-05, "loss": 0.0022, "num_tokens": 4761737.0, "reward": 1.5416666269302368, "reward_std": 0.40472298860549927, "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.40472301840782166, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 198.75, "completions/mean_terminated_length": 198.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.10920494373731784, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.09635504335165024, "learning_rate": 1.999522213311139e-05, "loss": 0.0039, "num_tokens": 4767415.0, "reward": 1.01630437374115, "reward_std": 0.046115659177303314, "rewards/fixed_code_pass_all_test_reward/mean": 0.016304347664117813, "rewards/fixed_code_pass_all_test_reward/std": 0.04611566290259361, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.10938941154768493, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.08177581150084734, "learning_rate": 1.9995020998291275e-05, "loss": 0.0033, "num_tokens": 4775826.0, "reward": 1.5, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 215.625, "completions/mean_terminated_length": 215.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.10957387935805202, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.1140624824911356, "learning_rate": 1.9994815717752282e-05, "loss": 0.0046, "num_tokens": 4781527.0, "reward": 1.1100001335144043, "reward_std": 0.09133928269147873, "rewards/fixed_code_pass_all_test_reward/mean": 0.10999999940395355, "rewards/fixed_code_pass_all_test_reward/std": 0.09133924543857574, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1097583471684191, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.09699875302612782, "learning_rate": 1.9994606291579552e-05, "loss": 0.0039, "num_tokens": 4791601.0, "reward": 1.1399999856948853, "reward_std": 0.2500285804271698, "rewards/fixed_code_pass_all_test_reward/mean": 0.14000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.2500285804271698, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.1099428149787862, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.11417353991419077, "learning_rate": 1.9994392719859953e-05, "loss": 0.0046, "num_tokens": 4799071.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.11012728278915329, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.06565936980769038, "learning_rate": 1.9994175002682076e-05, "loss": 0.0026, "num_tokens": 4803485.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.11031175059952038, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.11164399329572916, "learning_rate": 1.999395314013622e-05, "loss": 0.0045, "num_tokens": 4807718.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.11049621840988748, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.12892050947993994, "learning_rate": 1.999372713231441e-05, "loss": 0.0052, "num_tokens": 4813470.0, "reward": 1.0132979154586792, "reward_std": 0.44759485125541687, "rewards/fixed_code_pass_all_test_reward/mean": 0.13829787075519562, "rewards/fixed_code_pass_all_test_reward/std": 0.1892814189195633, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 483.125, "completions/mean_terminated_length": 483.125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.11068068622025456, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.06861923448741436, "learning_rate": 1.9993496979310386e-05, "loss": 0.0027, "num_tokens": 4828423.0, "reward": 0.8214285373687744, "reward_std": 0.6904175281524658, "rewards/fixed_code_pass_all_test_reward/mean": 0.1964285671710968, "rewards/fixed_code_pass_all_test_reward/std": 0.34940600395202637, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 154.5, "completions/mean_terminated_length": 154.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.11086515403062165, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.12961936835199594, "learning_rate": 1.9993262681219614e-05, "loss": 0.0052, "num_tokens": 4833931.0, "reward": 1.1538461446762085, "reward_std": 0.5756396055221558, "rewards/fixed_code_pass_all_test_reward/mean": 0.2788461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.3559362590312958, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.11104962184098875, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.19090850837528706, "learning_rate": 1.9993024238139273e-05, "loss": 0.0076, "num_tokens": 4842422.0, "reward": 1.547029733657837, "reward_std": 0.49966028332710266, "rewards/fixed_code_pass_all_test_reward/mean": 0.5470297336578369, "rewards/fixed_code_pass_all_test_reward/std": 0.49966031312942505, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 136.0, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.11123408965135584, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "kl": 0.1272475915029645, "learning_rate": 1.9992781650168268e-05, "loss": 0.0051, "num_tokens": 4848678.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 219.5, "completions/mean_terminated_length": 219.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.11141855746172293, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.16565214097499847, "learning_rate": 1.999253491740722e-05, "loss": 0.0066, "num_tokens": 4858290.0, "reward": 1.6841216087341309, "reward_std": 0.17707158625125885, "rewards/fixed_code_pass_all_test_reward/mean": 0.6841216087341309, "rewards/fixed_code_pass_all_test_reward/std": 0.17707160115242004, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 408.125, "completions/mean_terminated_length": 408.125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.11160302527209003, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.06693266052752733, "learning_rate": 1.9992284039958462e-05, "loss": 0.0027, "num_tokens": 4867259.0, "reward": 1.46875, "reward_std": 0.6298423409461975, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.35477888584136963, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 316.875, "completions/mean_terminated_length": 316.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.11178749308245711, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.05718480609357357, "learning_rate": 1.9992029017926057e-05, "loss": 0.0023, "num_tokens": 4877122.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 113.0, "completions/mean_terminated_length": 113.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.1119719608928242, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.12263566348701715, "learning_rate": 1.999176985141578e-05, "loss": 0.0049, "num_tokens": 4881802.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 375.375, "completions/mean_terminated_length": 375.375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.1121564287031913, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.13403827045112848, "learning_rate": 1.9991506540535132e-05, "loss": 0.0054, "num_tokens": 4894861.0, "reward": 1.6647727489471436, "reward_std": 0.6469151377677917, "rewards/fixed_code_pass_all_test_reward/mean": 0.7897727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.3984455466270447, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 230.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.11234089651355839, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.12798434030264616, "learning_rate": 1.9991239085393324e-05, "loss": 0.0051, "num_tokens": 4903612.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 379.75, "completions/mean_terminated_length": 379.75, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.11252536432392547, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.04484792961739004, "learning_rate": 1.9990967486101297e-05, "loss": 0.0018, "num_tokens": 4913970.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 375.375, "completions/mean_terminated_length": 375.375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.11270983213429256, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.0693159019574523, "learning_rate": 1.9990691742771696e-05, "loss": 0.0028, "num_tokens": 4922397.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.11289429994465966, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.13098814245313406, "learning_rate": 1.99904118555189e-05, "loss": 0.0052, "num_tokens": 4931310.0, "reward": 1.1477272510528564, "reward_std": 0.5616726875305176, "rewards/fixed_code_pass_all_test_reward/mean": 0.39772728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.28584954142570496, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 358.25, "completions/mean_terminated_length": 358.25, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.11307876775502675, "frac_reward_zero_std": 1.0, "grad_norm": 0.18359375, "kl": 0.05773609993048012, "learning_rate": 1.9990127824458998e-05, "loss": 0.0023, "num_tokens": 4944192.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.11326323556539383, "frac_reward_zero_std": 0.0, "grad_norm": 3.390625, "kl": 0.17470278963446617, "learning_rate": 1.9989839649709798e-05, "loss": 0.007, "num_tokens": 4951985.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 155.625, "completions/mean_terminated_length": 155.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.11344770337576093, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.12905095191672444, "learning_rate": 1.998954733139083e-05, "loss": 0.0052, "num_tokens": 4957446.0, "reward": 1.2664835453033447, "reward_std": 0.24195440113544464, "rewards/fixed_code_pass_all_test_reward/mean": 0.2664835453033447, "rewards/fixed_code_pass_all_test_reward/std": 0.24195440113544464, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.11363217118612802, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.1004163371399045, "learning_rate": 1.998925086962334e-05, "loss": 0.004, "num_tokens": 4965320.0, "reward": 1.19921875, "reward_std": 0.32687368988990784, "rewards/fixed_code_pass_all_test_reward/mean": 0.19921875, "rewards/fixed_code_pass_all_test_reward/std": 0.32687368988990784, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1138166389964951, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.06108594732359052, "learning_rate": 1.99889502645303e-05, "loss": 0.0024, "num_tokens": 4973988.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 309.5, "completions/mean_terminated_length": 309.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.1140011068068622, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.07262592762708664, "learning_rate": 1.998864551623639e-05, "loss": 0.0029, "num_tokens": 4984272.0, "reward": 1.2699999809265137, "reward_std": 0.45090386271476746, "rewards/fixed_code_pass_all_test_reward/mean": 0.26999998092651367, "rewards/fixed_code_pass_all_test_reward/std": 0.45090389251708984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.1141855746172293, "frac_reward_zero_std": 0.0, "grad_norm": 3.203125, "kl": 0.15129907615482807, "learning_rate": 1.998833662486801e-05, "loss": 0.0061, "num_tokens": 4993045.0, "reward": 1.0234375, "reward_std": 0.29493018984794617, "rewards/fixed_code_pass_all_test_reward/mean": 0.1484375, "rewards/fixed_code_pass_all_test_reward/std": 0.13336937129497528, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 262.375, "completions/mean_terminated_length": 262.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.11437004242759638, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.08227353077381849, "learning_rate": 1.9988023590553287e-05, "loss": 0.0033, "num_tokens": 5004104.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 131.875, "completions/mean_terminated_length": 131.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.11455451023796348, "frac_reward_zero_std": 0.0, "grad_norm": 3.71875, "kl": 0.11879996210336685, "learning_rate": 1.998770641342206e-05, "loss": 0.0048, "num_tokens": 5007895.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.11473897804833057, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.11482868017628789, "learning_rate": 1.9987385093605886e-05, "loss": 0.0046, "num_tokens": 5015491.0, "reward": 1.1875, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.11492344585869765, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.0824491074308753, "learning_rate": 1.9987059631238038e-05, "loss": 0.0033, "num_tokens": 5019201.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.11510791366906475, "frac_reward_zero_std": 1.0, "grad_norm": 0.287109375, "kl": 0.11371715925633907, "learning_rate": 1.9986730026453515e-05, "loss": 0.0045, "num_tokens": 5026523.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 627.75, "completions/mean_terminated_length": 627.75, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.11529238147943184, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.034372989321127534, "learning_rate": 1.9986396279389028e-05, "loss": 0.0014, "num_tokens": 5042625.0, "reward": 0.9907407760620117, "reward_std": 0.40896767377853394, "rewards/fixed_code_pass_all_test_reward/mean": 0.11574073880910873, "rewards/fixed_code_pass_all_test_reward/std": 0.09584243595600128, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 393.0, "completions/mean_terminated_length": 393.0, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.11547684928979893, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.03842742653796449, "learning_rate": 1.9986058390183013e-05, "loss": 0.0015, "num_tokens": 5052785.0, "reward": 1.25, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.11566131710016603, "frac_reward_zero_std": 1.0, "grad_norm": 0.1923828125, "kl": 0.09160548308864236, "learning_rate": 1.998571635897561e-05, "loss": 0.0037, "num_tokens": 5060781.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 91.5, "completions/mean_terminated_length": 91.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.11584578491053311, "frac_reward_zero_std": 0.0, "grad_norm": 4.59375, "kl": 0.1627300502732396, "learning_rate": 1.9985370185908693e-05, "loss": 0.0065, "num_tokens": 5064153.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 475.375, "completions/mean_terminated_length": 475.375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.1160302527209002, "frac_reward_zero_std": 0.0, "grad_norm": 0.734375, "kl": 0.037203476298600435, "learning_rate": 1.998501987112585e-05, "loss": 0.0015, "num_tokens": 5074820.0, "reward": 0.9791666269302368, "reward_std": 0.33850160241127014, "rewards/fixed_code_pass_all_test_reward/mean": 0.1041666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.08625819534063339, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.11621472053126729, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.12457197811454535, "learning_rate": 1.9984665414772376e-05, "loss": 0.005, "num_tokens": 5078940.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 339.875, "completions/mean_terminated_length": 339.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.11639918834163439, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0807022606022656, "learning_rate": 1.9984306816995293e-05, "loss": 0.0032, "num_tokens": 5086315.0, "reward": 1.4801137447357178, "reward_std": 0.664214015007019, "rewards/fixed_code_pass_all_test_reward/mean": 0.6051136255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.37853753566741943, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.11658365615200147, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0841572261415422, "learning_rate": 1.9983944077943346e-05, "loss": 0.0034, "num_tokens": 5091415.0, "reward": 1.134615421295166, "reward_std": 0.054392822086811066, "rewards/fixed_code_pass_all_test_reward/mean": 0.13461539149284363, "rewards/fixed_code_pass_all_test_reward/std": 0.05439283326268196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.11676812396236856, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.05372287845239043, "learning_rate": 1.9983577197766984e-05, "loss": 0.0021, "num_tokens": 5097132.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 339.75, "completions/mean_terminated_length": 339.75, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.11695259177273566, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.03589125070720911, "learning_rate": 1.998320617661839e-05, "loss": 0.0014, "num_tokens": 5108082.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 293.875, "completions/mean_terminated_length": 293.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.11713705958310275, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.07464974513277411, "learning_rate": 1.9982831014651443e-05, "loss": 0.003, "num_tokens": 5117657.0, "reward": 1.228124976158142, "reward_std": 0.48613590002059937, "rewards/fixed_code_pass_all_test_reward/mean": 0.3531250059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.13258251547813416, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 482.0, "completions/mean_terminated_length": 482.0, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.11732152739346983, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.046311456710100174, "learning_rate": 1.998245171202176e-05, "loss": 0.0019, "num_tokens": 5130617.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 486.25, "completions/mean_terminated_length": 486.25, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.11750599520383694, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.043529008398763835, "learning_rate": 1.998206826888667e-05, "loss": 0.0017, "num_tokens": 5143883.0, "reward": 0.692307710647583, "reward_std": 0.584384024143219, "rewards/fixed_code_pass_all_test_reward/mean": 0.06730769574642181, "rewards/fixed_code_pass_all_test_reward/std": 0.12631389498710632, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.11769046301420402, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.05568939307704568, "learning_rate": 1.998168068540521e-05, "loss": 0.0022, "num_tokens": 5149595.0, "reward": 1.8888888359069824, "reward_std": 0.20573778450489044, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 235.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.11787493082457111, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.06522013572975993, "learning_rate": 1.9981288961738147e-05, "loss": 0.0026, "num_tokens": 5160524.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 174.875, "completions/mean_terminated_length": 174.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.11805939863493821, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.053259233478456736, "learning_rate": 1.9980893098047954e-05, "loss": 0.0021, "num_tokens": 5165907.0, "reward": 1.6607142686843872, "reward_std": 0.43824127316474915, "rewards/fixed_code_pass_all_test_reward/mean": 0.6607142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.43824127316474915, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.1182438664453053, "frac_reward_zero_std": 0.0, "grad_norm": 3.21875, "kl": 0.08381124678999186, "learning_rate": 1.9980493094498835e-05, "loss": 0.0034, "num_tokens": 5170363.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 177.625, "completions/mean_terminated_length": 177.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.11842833425567238, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.11230818321928382, "learning_rate": 1.9980088951256695e-05, "loss": 0.0045, "num_tokens": 5179736.0, "reward": 1.524999976158142, "reward_std": 0.40620192885398865, "rewards/fixed_code_pass_all_test_reward/mean": 0.5249999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.40620192885398865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 472.625, "completions/mean_terminated_length": 472.625, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.11861280206603948, "frac_reward_zero_std": 1.0, "grad_norm": 0.1748046875, "kl": 0.040467410115525126, "learning_rate": 1.9979680668489166e-05, "loss": 0.0016, "num_tokens": 5191773.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 181.0, "completions/mean_terminated_length": 181.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.11879726987640657, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.08629681449383497, "learning_rate": 1.9979268246365597e-05, "loss": 0.0035, "num_tokens": 5197093.0, "reward": 1.375, "reward_std": 0.6799276471138, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.44095855951309204, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.11898173768677366, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.10201717633754015, "learning_rate": 1.9978851685057048e-05, "loss": 0.0041, "num_tokens": 5202838.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 95.125, "completions/mean_terminated_length": 95.125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.11916620549714074, "frac_reward_zero_std": 1.0, "grad_norm": 0.326171875, "kl": 0.09894145838916302, "learning_rate": 1.9978430984736303e-05, "loss": 0.004, "num_tokens": 5206375.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.11935067330750784, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.07876555575057864, "learning_rate": 1.997800614557786e-05, "loss": 0.0032, "num_tokens": 5212154.0, "reward": 1.26630437374115, "reward_std": 0.1513955444097519, "rewards/fixed_code_pass_all_test_reward/mean": 0.2663043439388275, "rewards/fixed_code_pass_all_test_reward/std": 0.1513955295085907, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 421.75, "completions/mean_terminated_length": 421.75, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.11953514111787493, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.056640959810465574, "learning_rate": 1.997757716775793e-05, "loss": 0.0023, "num_tokens": 5221368.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 112.375, "completions/mean_terminated_length": 112.375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.11971960892824202, "frac_reward_zero_std": 0.0, "grad_norm": 8.375, "kl": 0.49097564816474915, "learning_rate": 1.9977144051454445e-05, "loss": 0.0196, "num_tokens": 5224979.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 279.125, "completions/mean_terminated_length": 279.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.11990407673860912, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.04865136346779764, "learning_rate": 1.9976706796847053e-05, "loss": 0.0019, "num_tokens": 5235148.0, "reward": 1.170454502105713, "reward_std": 0.14114977419376373, "rewards/fixed_code_pass_all_test_reward/mean": 0.17045456171035767, "rewards/fixed_code_pass_all_test_reward/std": 0.14114978909492493, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 351.125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.1200885445489762, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.0578660168685019, "learning_rate": 1.9976265404117117e-05, "loss": 0.0023, "num_tokens": 5243261.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.12027301235934329, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.06887826370075345, "learning_rate": 1.997581987344772e-05, "loss": 0.0028, "num_tokens": 5249681.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 257.625, "completions/mean_terminated_length": 257.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.12045748016971039, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.06329380068928003, "learning_rate": 1.997537020502365e-05, "loss": 0.0025, "num_tokens": 5257190.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.12064194798007748, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.07342360122129321, "learning_rate": 1.997491639903143e-05, "loss": 0.0029, "num_tokens": 5263761.0, "reward": 1.4249999523162842, "reward_std": 0.3882193863391876, "rewards/fixed_code_pass_all_test_reward/mean": 0.42500001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.38821941614151, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 101.25, "completions/mean_terminated_length": 101.25, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.12082641579044456, "frac_reward_zero_std": 1.0, "grad_norm": 0.330078125, "kl": 0.13406100124120712, "learning_rate": 1.9974458455659282e-05, "loss": 0.0054, "num_tokens": 5267283.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.12101088360081166, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.07318445853888988, "learning_rate": 1.9973996375097154e-05, "loss": 0.0029, "num_tokens": 5277246.0, "reward": 0.90625, "reward_std": 0.48065242171287537, "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, "rewards/fixed_code_pass_all_test_reward/std": 0.21564547717571259, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.12119535141117875, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.06780564598739147, "learning_rate": 1.997353015753671e-05, "loss": 0.0027, "num_tokens": 5285088.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 95.875, "completions/mean_terminated_length": 95.875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.12137981922154584, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "kl": 0.11693045683205128, "learning_rate": 1.997305980317132e-05, "loss": 0.0047, "num_tokens": 5290487.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 408.625, "completions/mean_terminated_length": 408.625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.12156428703191294, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.07172954361885786, "learning_rate": 1.997258531219608e-05, "loss": 0.0029, "num_tokens": 5298868.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.12174875484228002, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.09596420871093869, "learning_rate": 1.9972106684807802e-05, "loss": 0.0038, "num_tokens": 5306547.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 192.625, "completions/mean_terminated_length": 192.625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.12193322265264711, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.127956235781312, "learning_rate": 1.9971623921205007e-05, "loss": 0.0051, "num_tokens": 5313000.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1221176904630142, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "kl": 0.15484413783997297, "learning_rate": 1.9971137021587933e-05, "loss": 0.0062, "num_tokens": 5321371.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.1223021582733813, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.11579650989733636, "learning_rate": 1.997064598615854e-05, "loss": 0.0046, "num_tokens": 5328137.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.12248662608374838, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.05081452988088131, "learning_rate": 1.9970150815120494e-05, "loss": 0.002, "num_tokens": 5337367.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 318.75, "completions/mean_terminated_length": 318.75, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.12267109389411547, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.07257914822548628, "learning_rate": 1.9969651508679188e-05, "loss": 0.0029, "num_tokens": 5348981.0, "reward": 1.4354839324951172, "reward_std": 0.49525773525238037, "rewards/fixed_code_pass_all_test_reward/mean": 0.4354838728904724, "rewards/fixed_code_pass_all_test_reward/std": 0.49525773525238037, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 102.0, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.12285556170448257, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.06944853253662586, "learning_rate": 1.9969148067041715e-05, "loss": 0.0028, "num_tokens": 5354829.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 90.75, "completions/mean_terminated_length": 90.75, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.12304002951484966, "frac_reward_zero_std": 0.0, "grad_norm": 3.484375, "kl": 0.08239260874688625, "learning_rate": 1.99686404904169e-05, "loss": 0.0033, "num_tokens": 5358379.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.12322449732521674, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.08544948953203857, "learning_rate": 1.996812877901527e-05, "loss": 0.0034, "num_tokens": 5363457.0, "reward": 1.796875, "reward_std": 0.5745242834091187, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.12340896513558385, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.10545390099287033, "learning_rate": 1.996761293304907e-05, "loss": 0.0042, "num_tokens": 5368227.0, "reward": 1.9083333015441895, "reward_std": 0.2592725157737732, "rewards/fixed_code_pass_all_test_reward/mean": 0.9083333015441895, "rewards/fixed_code_pass_all_test_reward/std": 0.2592725157737732, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 96.375, "completions/mean_terminated_length": 96.375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.12359343294595093, "frac_reward_zero_std": 1.0, "grad_norm": 0.392578125, "kl": 0.1580355940386653, "learning_rate": 1.9967092952732266e-05, "loss": 0.0063, "num_tokens": 5371694.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 451.875, "completions/mean_terminated_length": 451.875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.12377790075631802, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.05688100145198405, "learning_rate": 1.9966568838280534e-05, "loss": 0.0023, "num_tokens": 5381037.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.12396236856668512, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.09942095819860697, "learning_rate": 1.9966040589911266e-05, "loss": 0.004, "num_tokens": 5386441.0, "reward": 1.625, "reward_std": 0.16732670366764069, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.16732673346996307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.1241468363770522, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.06828771019354463, "learning_rate": 1.996550820784356e-05, "loss": 0.0027, "num_tokens": 5394268.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.12433130418741929, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.0882972190156579, "learning_rate": 1.996497169229825e-05, "loss": 0.0035, "num_tokens": 5399721.0, "reward": 1.1301020383834839, "reward_std": 0.3142625689506531, "rewards/fixed_code_pass_all_test_reward/mean": 0.1301020383834839, "rewards/fixed_code_pass_all_test_reward/std": 0.3142625093460083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 237.375, "completions/mean_terminated_length": 237.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.12451577199778639, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.07540575554594398, "learning_rate": 1.9964431043497862e-05, "loss": 0.003, "num_tokens": 5405932.0, "reward": 1.3809523582458496, "reward_std": 0.30009177327156067, "rewards/fixed_code_pass_all_test_reward/mean": 0.5059523582458496, "rewards/fixed_code_pass_all_test_reward/std": 0.1567765474319458, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.12470023980815348, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.08324785390868783, "learning_rate": 1.9963886261666647e-05, "loss": 0.0033, "num_tokens": 5413591.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 247.625, "completions/mean_terminated_length": 247.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.12488470761852057, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.06444656546227634, "learning_rate": 1.9963337347030568e-05, "loss": 0.0026, "num_tokens": 5419212.0, "reward": 1.53125, "reward_std": 0.5033370852470398, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.405046284198761, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.12506917542888765, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.08405112288892269, "learning_rate": 1.9962784299817308e-05, "loss": 0.0034, "num_tokens": 5426799.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 145.125, "completions/mean_terminated_length": 145.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.12525364323925475, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.10574832372367382, "learning_rate": 1.9962227120256253e-05, "loss": 0.0042, "num_tokens": 5431376.0, "reward": 1.75, "reward_std": 0.09258195757865906, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 209.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.12543811104962185, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.11608895286917686, "learning_rate": 1.9961665808578512e-05, "loss": 0.0046, "num_tokens": 5439074.0, "reward": 1.875, "reward_std": 0.1725163757801056, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.17251639068126678, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.12562257885998893, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.08875698456540704, "learning_rate": 1.9961100365016903e-05, "loss": 0.0036, "num_tokens": 5444374.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.12580704667035603, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.13125556334853172, "learning_rate": 1.9960530789805963e-05, "loss": 0.0053, "num_tokens": 5448842.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 430.0, "completions/mean_terminated_length": 430.0, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.12599151448072313, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.036476825596764684, "learning_rate": 1.9959957083181937e-05, "loss": 0.0015, "num_tokens": 5461658.0, "reward": 1.0370370149612427, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.03703703731298447, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1261759822910902, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.0898973923176527, "learning_rate": 1.9959379245382785e-05, "loss": 0.0036, "num_tokens": 5468203.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 262.375, "completions/mean_terminated_length": 262.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.1263604501014573, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.04375476622954011, "learning_rate": 1.995879727664819e-05, "loss": 0.0018, "num_tokens": 5475910.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 363.625, "completions/mean_terminated_length": 363.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.12654491791182437, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.0609865584410727, "learning_rate": 1.9958211177219528e-05, "loss": 0.0024, "num_tokens": 5483627.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.12672938572219147, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.14879112876951694, "learning_rate": 1.9957620947339905e-05, "loss": 0.006, "num_tokens": 5490597.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 276.25, "completions/mean_terminated_length": 276.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.12691385353255857, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.08415928995236754, "learning_rate": 1.9957026587254136e-05, "loss": 0.0034, "num_tokens": 5495671.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.12709832134292565, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.05760736856609583, "learning_rate": 1.995642809720875e-05, "loss": 0.0023, "num_tokens": 5507956.0, "reward": 1.7234848737716675, "reward_std": 0.06629171222448349, "rewards/fixed_code_pass_all_test_reward/mean": 0.7234848737716675, "rewards/fixed_code_pass_all_test_reward/std": 0.06629174947738647, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.12728278915329275, "frac_reward_zero_std": 1.0, "grad_norm": 1.03125, "kl": 0.12517809309065342, "learning_rate": 1.995582547745199e-05, "loss": 0.005, "num_tokens": 5515491.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1176.0, "completions/max_terminated_length": 1176.0, "completions/mean_length": 610.625, "completions/mean_terminated_length": 610.625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.12746725696365985, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.05356838763691485, "learning_rate": 1.9955218728233806e-05, "loss": 0.0021, "num_tokens": 5525760.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.12765172477402692, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.13814793527126312, "learning_rate": 1.995460784980586e-05, "loss": 0.0055, "num_tokens": 5529657.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.12783619258439402, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.0596997644752264, "learning_rate": 1.9953992842421545e-05, "loss": 0.0024, "num_tokens": 5534538.0, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.12802066039476112, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.14837194047868252, "learning_rate": 1.9953373706335934e-05, "loss": 0.0059, "num_tokens": 5543012.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 281.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.1282051282051282, "frac_reward_zero_std": 1.0, "grad_norm": 0.8203125, "kl": 0.14228621125221252, "learning_rate": 1.995275044180585e-05, "loss": 0.0057, "num_tokens": 5552911.0, "reward": 1.633802890777588, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6338028311729431, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.1283895960154953, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.08018416864797473, "learning_rate": 1.9952123049089796e-05, "loss": 0.0032, "num_tokens": 5559511.0, "reward": 1.0640757083892822, "reward_std": 0.1812332719564438, "rewards/fixed_code_pass_all_test_reward/mean": 0.06407563388347626, "rewards/fixed_code_pass_all_test_reward/std": 0.1812332570552826, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 511.625, "completions/mean_terminated_length": 511.625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.1285740638258624, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.05372215295210481, "learning_rate": 1.9951491528448005e-05, "loss": 0.0021, "num_tokens": 5569844.0, "reward": 1.5833333730697632, "reward_std": 0.2357023060321808, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 487.375, "completions/mean_terminated_length": 487.375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.12875853163622947, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.05157871241681278, "learning_rate": 1.9950855880142424e-05, "loss": 0.0021, "num_tokens": 5579271.0, "reward": 1.7400000095367432, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7400000095367432, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 155.25, "completions/mean_terminated_length": 155.25, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.12894299944659657, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "kl": 0.13004507962614298, "learning_rate": 1.9950216104436697e-05, "loss": 0.0052, "num_tokens": 5583281.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 334.125, "completions/mean_terminated_length": 334.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.12912746725696367, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.10626918473280966, "learning_rate": 1.9949572201596196e-05, "loss": 0.0043, "num_tokens": 5593130.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.12931193506733074, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.0809661103412509, "learning_rate": 1.9948924171887993e-05, "loss": 0.0032, "num_tokens": 5601786.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 451.75, "completions/mean_terminated_length": 451.75, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.12949640287769784, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.09269754868000746, "learning_rate": 1.994827201558088e-05, "loss": 0.0037, "num_tokens": 5610024.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 383.125, "completions/mean_terminated_length": 383.125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.12968087068806494, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.1000274708494544, "learning_rate": 1.9947615732945358e-05, "loss": 0.004, "num_tokens": 5617193.0, "reward": 1.6505377292633057, "reward_std": 0.28938230872154236, "rewards/fixed_code_pass_all_test_reward/mean": 0.6505376696586609, "rewards/fixed_code_pass_all_test_reward/std": 0.28938233852386475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 387.0, "completions/mean_terminated_length": 387.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.12986533849843201, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.10710296267643571, "learning_rate": 1.9946955324253635e-05, "loss": 0.0043, "num_tokens": 5627929.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 297.5, "completions/mean_terminated_length": 297.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.13004980630879912, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.14005849976092577, "learning_rate": 1.9946290789779637e-05, "loss": 0.0056, "num_tokens": 5636869.0, "reward": 1.0225000381469727, "reward_std": 0.024928444996476173, "rewards/fixed_code_pass_all_test_reward/mean": 0.022499999031424522, "rewards/fixed_code_pass_all_test_reward/std": 0.024928469210863113, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 259.75, "completions/mean_terminated_length": 259.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.13023427411916622, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.0997302164323628, "learning_rate": 1.9945622129799e-05, "loss": 0.004, "num_tokens": 5645355.0, "reward": 1.5625, "reward_std": 0.21407300233840942, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.2140730321407318, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.1304187419295333, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.11492943298071623, "learning_rate": 1.994494934458907e-05, "loss": 0.0046, "num_tokens": 5656631.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.1306032097399004, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.13582306262105703, "learning_rate": 1.9944272434428896e-05, "loss": 0.0054, "num_tokens": 5662787.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 756.75, "completions/mean_terminated_length": 756.75, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.1307876775502675, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.0572455944493413, "learning_rate": 1.9943591399599254e-05, "loss": 0.0023, "num_tokens": 5675697.0, "reward": 0.8333332538604736, "reward_std": 0.5194624662399292, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.0890870913863182, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.13097214536063456, "frac_reward_zero_std": 0.0, "grad_norm": 3.9375, "kl": 0.3684293841943145, "learning_rate": 1.994290624038262e-05, "loss": 0.0147, "num_tokens": 5682735.0, "reward": 1.9832088947296143, "reward_std": 0.023173892870545387, "rewards/fixed_code_pass_all_test_reward/mean": 0.9832088947296143, "rewards/fixed_code_pass_all_test_reward/std": 0.023173855617642403, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.13115661317100166, "frac_reward_zero_std": 1.0, "grad_norm": 0.240234375, "kl": 0.1040666839107871, "learning_rate": 1.9942216957063182e-05, "loss": 0.0042, "num_tokens": 5692239.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 147.625, "completions/mean_terminated_length": 147.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.13134108098136876, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.11894275899976492, "learning_rate": 1.9941523549926842e-05, "loss": 0.0048, "num_tokens": 5701028.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 145.875, "completions/mean_terminated_length": 145.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.13152554879173584, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.1872598184272647, "learning_rate": 1.994082601926121e-05, "loss": 0.0075, "num_tokens": 5710083.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.13171001660210294, "frac_reward_zero_std": 0.0, "grad_norm": 3.8125, "kl": 0.0665245356503874, "learning_rate": 1.9940124365355605e-05, "loss": 0.0027, "num_tokens": 5714029.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 112.375, "completions/mean_terminated_length": 112.375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.13189448441247004, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.16306263953447342, "learning_rate": 1.993941858850106e-05, "loss": 0.0065, "num_tokens": 5719624.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.1320789522228371, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.2008783621713519, "learning_rate": 1.9938708688990312e-05, "loss": 0.008, "num_tokens": 5728568.0, "reward": 0.9090908765792847, "reward_std": 0.3732495605945587, "rewards/fixed_code_pass_all_test_reward/mean": 0.034090910106897354, "rewards/fixed_code_pass_all_test_reward/std": 0.06763853132724762, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 92.25, "completions/mean_terminated_length": 92.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.1322634200332042, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.1945976186543703, "learning_rate": 1.9937994667117818e-05, "loss": 0.0078, "num_tokens": 5734714.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 312.0, "completions/mean_terminated_length": 312.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.1324478878435713, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.05102223623543978, "learning_rate": 1.9937276523179733e-05, "loss": 0.002, "num_tokens": 5742578.0, "reward": 1.7792208194732666, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7792207598686218, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 404.0, "completions/mean_terminated_length": 404.0, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.13263235565393838, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.04954998311586678, "learning_rate": 1.9936554257473932e-05, "loss": 0.002, "num_tokens": 5752042.0, "reward": 1.4583333730697632, "reward_std": 0.3053751587867737, "rewards/fixed_code_pass_all_test_reward/mean": 0.4583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.3053751289844513, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.13281682346430548, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.1398388994857669, "learning_rate": 1.9935827870299994e-05, "loss": 0.0056, "num_tokens": 5759878.0, "reward": 0.9772727489471436, "reward_std": 0.3977504372596741, "rewards/fixed_code_pass_all_test_reward/mean": 0.10227273404598236, "rewards/fixed_code_pass_all_test_reward/std": 0.06312409788370132, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.13300129127467256, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.09609882393851876, "learning_rate": 1.9935097361959208e-05, "loss": 0.0038, "num_tokens": 5764402.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 113.125, "completions/mean_terminated_length": 113.125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.13318575908503966, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.19568195194005966, "learning_rate": 1.993436273275457e-05, "loss": 0.0078, "num_tokens": 5772691.0, "reward": 1.6796875, "reward_std": 0.6505385041236877, "rewards/fixed_code_pass_all_test_reward/mean": 0.8046875, "rewards/fixed_code_pass_all_test_reward/std": 0.36164847016334534, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.13337022689540676, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.12031527608633041, "learning_rate": 1.9933623982990796e-05, "loss": 0.0048, "num_tokens": 5780778.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 90.25, "completions/mean_terminated_length": 90.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.13355469470577383, "frac_reward_zero_std": 0.0, "grad_norm": 3.609375, "kl": 0.20727140083909035, "learning_rate": 1.9932881112974298e-05, "loss": 0.0083, "num_tokens": 5784316.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.13373916251614093, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.07938179839402437, "learning_rate": 1.9932134123013204e-05, "loss": 0.0032, "num_tokens": 5789046.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.13392363032650803, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.07125245733186603, "learning_rate": 1.9931383013417345e-05, "loss": 0.0029, "num_tokens": 5798186.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 232.25, "completions/mean_terminated_length": 232.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.1341080981368751, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.13078368734568357, "learning_rate": 1.9930627784498272e-05, "loss": 0.0052, "num_tokens": 5807244.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 76.0, "completions/mean_terminated_length": 76.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.1342925659472422, "frac_reward_zero_std": 0.0, "grad_norm": 4.9375, "kl": 0.18241225741803646, "learning_rate": 1.9929868436569234e-05, "loss": 0.0073, "num_tokens": 5817628.0, "reward": 1.4567307233810425, "reward_std": 0.27074727416038513, "rewards/fixed_code_pass_all_test_reward/mean": 0.45673078298568726, "rewards/fixed_code_pass_all_test_reward/std": 0.2707473039627075, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 511.25, "completions/mean_terminated_length": 291.71429443359375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.1344770337576093, "frac_reward_zero_std": 0.0, "grad_norm": 0.703125, "kl": 0.07732725905952975, "learning_rate": 1.9929104969945193e-05, "loss": 0.0031, "num_tokens": 5828646.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 108.125, "completions/mean_terminated_length": 108.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.13466150156797638, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.20433220081031322, "learning_rate": 1.992833738494282e-05, "loss": 0.0082, "num_tokens": 5836335.0, "reward": 1.5449562072753906, "reward_std": 0.22019554674625397, "rewards/fixed_code_pass_all_test_reward/mean": 0.5449561476707458, "rewards/fixed_code_pass_all_test_reward/std": 0.22019553184509277, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 101.25, "completions/mean_terminated_length": 101.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.13484596937834348, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.09373309276998043, "learning_rate": 1.992756568188049e-05, "loss": 0.0037, "num_tokens": 5840145.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.13503043718871058, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.09777151234447956, "learning_rate": 1.9926789861078285e-05, "loss": 0.0039, "num_tokens": 5848299.0, "reward": 1.625, "reward_std": 0.16828222572803497, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.16828221082687378, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.13521490499907765, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.14239140693098307, "learning_rate": 1.992600992285801e-05, "loss": 0.0057, "num_tokens": 5855923.0, "reward": 1.75, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.13539937280944475, "frac_reward_zero_std": 0.0, "grad_norm": 3.515625, "kl": 0.29236338404007256, "learning_rate": 1.992522586754315e-05, "loss": 0.0117, "num_tokens": 5870827.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 100.125, "completions/mean_terminated_length": 100.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.13558384061981185, "frac_reward_zero_std": 0.0, "grad_norm": 3.4375, "kl": 0.11744115501642227, "learning_rate": 1.9924437695458932e-05, "loss": 0.0047, "num_tokens": 5876612.0, "reward": 1.6875, "reward_std": 0.4403957426548004, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.4403957426548004, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 179.375, "completions/mean_terminated_length": 179.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.13576830843017892, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.15629747230559587, "learning_rate": 1.9923645406932263e-05, "loss": 0.0063, "num_tokens": 5883591.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 116.25, "completions/mean_terminated_length": 116.25, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.13595277624054602, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.14899690076708794, "learning_rate": 1.992284900229177e-05, "loss": 0.006, "num_tokens": 5890545.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 562.0, "completions/mean_terminated_length": 562.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.13613724405091313, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.06598125724121928, "learning_rate": 1.9922048481867785e-05, "loss": 0.0026, "num_tokens": 5905281.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 361.0, "completions/mean_terminated_length": 361.0, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.1363217118612802, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.05145368305966258, "learning_rate": 1.992124384599234e-05, "loss": 0.0021, "num_tokens": 5913321.0, "reward": 1.296875, "reward_std": 0.3991480767726898, "rewards/fixed_code_pass_all_test_reward/mean": 0.546875, "rewards/fixed_code_pass_all_test_reward/std": 0.33863896131515503, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 322.25, "completions/mean_terminated_length": 322.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.1365061796716473, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.06860226579010487, "learning_rate": 1.9920435094999186e-05, "loss": 0.0027, "num_tokens": 5920275.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1366906474820144, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.11104825139045715, "learning_rate": 1.991962222922378e-05, "loss": 0.0044, "num_tokens": 5928021.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 336.875, "completions/mean_terminated_length": 336.875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.13687511529238147, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.053951806388795376, "learning_rate": 1.9918805249003272e-05, "loss": 0.0022, "num_tokens": 5940796.0, "reward": 1.4659090042114258, "reward_std": 0.6947900056838989, "rewards/fixed_code_pass_all_test_reward/mean": 0.5909090638160706, "rewards/fixed_code_pass_all_test_reward/std": 0.44932061433792114, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 230.625, "completions/mean_terminated_length": 230.625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.13705958310274857, "frac_reward_zero_std": 1.0, "grad_norm": 0.20703125, "kl": 0.053176770685240626, "learning_rate": 1.991798415467653e-05, "loss": 0.0021, "num_tokens": 5946441.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 437.625, "completions/mean_terminated_length": 437.625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.13724405091311567, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.049248069524765015, "learning_rate": 1.9917158946584135e-05, "loss": 0.002, "num_tokens": 5955318.0, "reward": 1.6071428060531616, "reward_std": 0.4302687346935272, "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428060531616, "rewards/fixed_code_pass_all_test_reward/std": 0.36967799067497253, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 153.625, "completions/mean_terminated_length": 153.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.13742851872348275, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.13433187874034047, "learning_rate": 1.9916329625068353e-05, "loss": 0.0054, "num_tokens": 5959579.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 385.125, "completions/mean_terminated_length": 385.125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.13761298653384985, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.0735728912986815, "learning_rate": 1.9915496190473178e-05, "loss": 0.0029, "num_tokens": 5971180.0, "reward": 1.5833333730697632, "reward_std": 0.49601587653160095, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.49601587653160095, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.13779745434421695, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.04419311159290373, "learning_rate": 1.9914658643144293e-05, "loss": 0.0018, "num_tokens": 5976080.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.13798192215458402, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.05045714764855802, "learning_rate": 1.9913816983429104e-05, "loss": 0.002, "num_tokens": 5981458.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.13816638996495112, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.04685618868097663, "learning_rate": 1.9912971211676703e-05, "loss": 0.0019, "num_tokens": 5990083.0, "reward": 1.7291666269302368, "reward_std": 0.38768240809440613, "rewards/fixed_code_pass_all_test_reward/mean": 0.7291666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.38768237829208374, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.13835085777531822, "frac_reward_zero_std": 1.0, "grad_norm": 0.283203125, "kl": 0.12362468149513006, "learning_rate": 1.9912121328237904e-05, "loss": 0.0049, "num_tokens": 5998200.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 860.0, "completions/mean_terminated_length": 860.0, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 0.1385353255856853, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.0437026449944824, "learning_rate": 1.991126733346522e-05, "loss": 0.0017, "num_tokens": 6015504.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 361.75, "completions/mean_terminated_length": 361.75, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.1387197933960524, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.09401163132861257, "learning_rate": 1.9910409227712864e-05, "loss": 0.0038, "num_tokens": 6028470.0, "reward": 1.5965908765792847, "reward_std": 0.43288490176200867, "rewards/fixed_code_pass_all_test_reward/mean": 0.5965908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.4328848719596863, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.13890426120641947, "frac_reward_zero_std": 1.0, "grad_norm": 0.265625, "kl": 0.09058415074832737, "learning_rate": 1.990954701133677e-05, "loss": 0.0036, "num_tokens": 6037405.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.13908872901678657, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.08996177837252617, "learning_rate": 1.990868068469456e-05, "loss": 0.0036, "num_tokens": 6044793.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 337.5, "completions/mean_terminated_length": 337.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.13927319682715367, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.07177273416891694, "learning_rate": 1.990781024814557e-05, "loss": 0.0029, "num_tokens": 6052509.0, "reward": 1.0546875, "reward_std": 0.3604893386363983, "rewards/fixed_code_pass_all_test_reward/mean": 0.1796875, "rewards/fixed_code_pass_all_test_reward/std": 0.08476267009973526, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.13945766463752074, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.11440308950841427, "learning_rate": 1.990693570205083e-05, "loss": 0.0046, "num_tokens": 6056703.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 116.75, "completions/mean_terminated_length": 116.75, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.13964213244788784, "frac_reward_zero_std": 1.0, "grad_norm": 0.189453125, "kl": 0.054297061171382666, "learning_rate": 1.9906057046773096e-05, "loss": 0.0022, "num_tokens": 6060621.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 204.75, "completions/mean_terminated_length": 204.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.13982660025825494, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.1038526650518179, "learning_rate": 1.9905174282676808e-05, "loss": 0.0042, "num_tokens": 6069955.0, "reward": 1.798076868057251, "reward_std": 0.1246296614408493, "rewards/fixed_code_pass_all_test_reward/mean": 0.798076868057251, "rewards/fixed_code_pass_all_test_reward/std": 0.12462963908910751, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 187.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.140011068068622, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.09025816759094596, "learning_rate": 1.9904287410128118e-05, "loss": 0.0036, "num_tokens": 6075393.0, "reward": 1.2063679695129395, "reward_std": 0.5611749291419983, "rewards/fixed_code_pass_all_test_reward/mean": 0.33136793971061707, "rewards/fixed_code_pass_all_test_reward/std": 0.3086109757423401, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 89.375, "completions/mean_terminated_length": 89.375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.1401955358789891, "frac_reward_zero_std": 1.0, "grad_norm": 0.2138671875, "kl": 0.09036492370069027, "learning_rate": 1.9903396429494882e-05, "loss": 0.0036, "num_tokens": 6078924.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 360.75, "completions/mean_terminated_length": 360.75, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.14038000368935621, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.038181931246072054, "learning_rate": 1.9902501341146657e-05, "loss": 0.0015, "num_tokens": 6086394.0, "reward": 1.6770832538604736, "reward_std": 0.5048131346702576, "rewards/fixed_code_pass_all_test_reward/mean": 0.9270833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.10386862605810165, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 422.375, "completions/mean_terminated_length": 422.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.1405644714997233, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.0639816434122622, "learning_rate": 1.990160214545471e-05, "loss": 0.0026, "num_tokens": 6098781.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 361.625, "completions/mean_terminated_length": 361.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.1407489393100904, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.10329381981864572, "learning_rate": 1.9900698842792008e-05, "loss": 0.0041, "num_tokens": 6109618.0, "reward": 1.0681817531585693, "reward_std": 0.5113816261291504, "rewards/fixed_code_pass_all_test_reward/mean": 0.1931818276643753, "rewards/fixed_code_pass_all_test_reward/std": 0.3275521695613861, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 155.25, "completions/mean_terminated_length": 155.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.1409334071204575, "frac_reward_zero_std": 0.0, "grad_norm": 3.421875, "kl": 0.1260461751371622, "learning_rate": 1.9899791433533222e-05, "loss": 0.005, "num_tokens": 6113868.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.14111787493082456, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.05335004813969135, "learning_rate": 1.989887991805472e-05, "loss": 0.0021, "num_tokens": 6124099.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 481.75, "completions/mean_terminated_length": 481.75, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.14130234274119166, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.06591639807447791, "learning_rate": 1.9897964296734588e-05, "loss": 0.0026, "num_tokens": 6133673.0, "reward": 1.3888888359069824, "reward_std": 0.5074402093887329, "rewards/fixed_code_pass_all_test_reward/mean": 0.3888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.5074402689933777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 527.0, "completions/mean_terminated_length": 309.71429443359375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.14148681055155876, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.044343432353343815, "learning_rate": 1.9897044569952597e-05, "loss": 0.0018, "num_tokens": 6144713.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 244.25, "completions/mean_terminated_length": 244.25, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.14167127836192583, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.05630635062698275, "learning_rate": 1.989612073809023e-05, "loss": 0.0023, "num_tokens": 6151331.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 81.25, "completions/mean_terminated_length": 81.25, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.14185574617229293, "frac_reward_zero_std": 1.0, "grad_norm": 0.236328125, "kl": 0.08770613931119442, "learning_rate": 1.9895192801530687e-05, "loss": 0.0035, "num_tokens": 6156733.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.14204021398266004, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.04448249517008662, "learning_rate": 1.9894260760658837e-05, "loss": 0.0018, "num_tokens": 6162663.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 463.125, "completions/mean_terminated_length": 463.125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.1422246817930271, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0457404674962163, "learning_rate": 1.989332461586128e-05, "loss": 0.0018, "num_tokens": 6172432.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 207.625, "completions/mean_terminated_length": 207.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.1424091496033942, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.13429220113903284, "learning_rate": 1.989238436752631e-05, "loss": 0.0054, "num_tokens": 6178853.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 239.25, "completions/mean_terminated_length": 239.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1425936174137613, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.03281144821085036, "learning_rate": 1.9891440016043915e-05, "loss": 0.0013, "num_tokens": 6184151.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 134.625, "completions/mean_terminated_length": 134.625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.14277808522412838, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "kl": 0.14762013964354992, "learning_rate": 1.9890491561805797e-05, "loss": 0.0059, "num_tokens": 6188044.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 397.625, "completions/mean_terminated_length": 161.85714721679688, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.14296255303449548, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.0797728179168189, "learning_rate": 1.9889539005205356e-05, "loss": 0.0032, "num_tokens": 6198105.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.14314702084486258, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.06959982635453343, "learning_rate": 1.9888582346637685e-05, "loss": 0.0028, "num_tokens": 6206134.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.14333148865522966, "frac_reward_zero_std": 1.0, "grad_norm": 0.1630859375, "kl": 0.1023345747962594, "learning_rate": 1.9887621586499592e-05, "loss": 0.0041, "num_tokens": 6213822.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 216.0, "completions/mean_terminated_length": 216.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.14351595646559676, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.06966275861486793, "learning_rate": 1.9886656725189575e-05, "loss": 0.0028, "num_tokens": 6222422.0, "reward": 1.5833332538604736, "reward_std": 0.29005303978919983, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.29005300998687744, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 338.75, "completions/mean_terminated_length": 338.75, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.14370042427596386, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.07792132534086704, "learning_rate": 1.988568776310784e-05, "loss": 0.0031, "num_tokens": 6230108.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 244.25, "completions/mean_terminated_length": 244.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.14388489208633093, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.039573403424583375, "learning_rate": 1.9884714700656294e-05, "loss": 0.0016, "num_tokens": 6235830.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 232.25, "completions/mean_terminated_length": 232.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.14406935989669803, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.1070977128110826, "learning_rate": 1.9883737538238533e-05, "loss": 0.0043, "num_tokens": 6244376.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.14425382770706513, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.04745132359676063, "learning_rate": 1.9882756276259875e-05, "loss": 0.0019, "num_tokens": 6251752.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.1444382955174322, "frac_reward_zero_std": 1.0, "grad_norm": 0.58203125, "kl": 0.09943527425639331, "learning_rate": 1.9881770915127316e-05, "loss": 0.004, "num_tokens": 6258914.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.1446227633277993, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.10889423871412873, "learning_rate": 1.988078145524957e-05, "loss": 0.0044, "num_tokens": 6265866.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 240.0, "completions/mean_terminated_length": 240.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.1448072311381664, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.054102213587611914, "learning_rate": 1.987978789703704e-05, "loss": 0.0022, "num_tokens": 6275858.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 153.375, "completions/mean_terminated_length": 153.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.14499169894853348, "frac_reward_zero_std": 1.0, "grad_norm": 0.2255859375, "kl": 0.09046365087851882, "learning_rate": 1.9878790240901835e-05, "loss": 0.0036, "num_tokens": 6282317.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.14517616675890058, "frac_reward_zero_std": 1.0, "grad_norm": 0.3125, "kl": 0.06109621305949986, "learning_rate": 1.987778848725775e-05, "loss": 0.0024, "num_tokens": 6288060.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 132.125, "completions/mean_terminated_length": 132.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.14536063456926765, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.1191755635663867, "learning_rate": 1.987678263652031e-05, "loss": 0.0048, "num_tokens": 6291981.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 343.5, "completions/mean_terminated_length": 343.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.14554510237963475, "frac_reward_zero_std": 1.0, "grad_norm": 0.1669921875, "kl": 0.07236590096727014, "learning_rate": 1.9875772689106707e-05, "loss": 0.0029, "num_tokens": 6300433.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 228.0, "completions/mean_terminated_length": 228.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.14572957019000185, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.04280627076514065, "learning_rate": 1.9874758645435846e-05, "loss": 0.0017, "num_tokens": 6306633.0, "reward": 0.942307710647583, "reward_std": 0.4248216152191162, "rewards/fixed_code_pass_all_test_reward/mean": 0.06730769574642181, "rewards/fixed_code_pass_all_test_reward/std": 0.19037491083145142, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.14591403800036892, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.0808372930623591, "learning_rate": 1.9873740505928336e-05, "loss": 0.0032, "num_tokens": 6310441.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 293.125, "completions/mean_terminated_length": 293.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.14609850581073602, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.0482973693870008, "learning_rate": 1.9872718271006477e-05, "loss": 0.0019, "num_tokens": 6319746.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 374.5, "completions/mean_terminated_length": 374.5, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.14628297362110312, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.04629168473184109, "learning_rate": 1.9871691941094266e-05, "loss": 0.0019, "num_tokens": 6327550.0, "reward": 1.21875, "reward_std": 0.36443448066711426, "rewards/fixed_code_pass_all_test_reward/mean": 0.21875, "rewards/fixed_code_pass_all_test_reward/std": 0.36443448066711426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 292.125, "completions/mean_terminated_length": 292.125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.1464674414314702, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.06161970482207835, "learning_rate": 1.987066151661741e-05, "loss": 0.0025, "num_tokens": 6333903.0, "reward": 1.625, "reward_std": 0.36154428124427795, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.2777460217475891, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 172.125, "completions/mean_terminated_length": 172.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.1466519092418373, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "kl": 0.08092693286016583, "learning_rate": 1.98696269980033e-05, "loss": 0.0032, "num_tokens": 6338256.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 188.75, "completions/mean_terminated_length": 188.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.1468363770522044, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.1006615785881877, "learning_rate": 1.9868588385681035e-05, "loss": 0.004, "num_tokens": 6343838.0, "reward": 1.5833333730697632, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.41547447443008423, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 161.375, "completions/mean_terminated_length": 161.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.14702084486257147, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.14185567665845156, "learning_rate": 1.9867545680081407e-05, "loss": 0.0057, "num_tokens": 6350681.0, "reward": 1.8813560009002686, "reward_std": 0.200545072555542, "rewards/fixed_code_pass_all_test_reward/mean": 0.8813559412956238, "rewards/fixed_code_pass_all_test_reward/std": 0.20054508745670319, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.14720531267293857, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.08218105277046561, "learning_rate": 1.986649888163691e-05, "loss": 0.0033, "num_tokens": 6356782.0, "reward": 1.454545497894287, "reward_std": 0.31114667654037476, "rewards/fixed_code_pass_all_test_reward/mean": 0.4545454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.31114673614501953, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 198.375, "completions/mean_terminated_length": 198.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.14738978048330567, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.09625663887709379, "learning_rate": 1.9865447990781733e-05, "loss": 0.0039, "num_tokens": 6361185.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 153.25, "completions/mean_terminated_length": 153.25, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.14757424829367274, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.10471935104578733, "learning_rate": 1.9864393007951762e-05, "loss": 0.0042, "num_tokens": 6368075.0, "reward": 1.4431817531585693, "reward_std": 0.28927096724510193, "rewards/fixed_code_pass_all_test_reward/mean": 0.4431818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.28927096724510193, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.14775871610403984, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.0715562510304153, "learning_rate": 1.986333393358458e-05, "loss": 0.0029, "num_tokens": 6371930.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.14794318391440694, "frac_reward_zero_std": 1.0, "grad_norm": 1.1796875, "kl": 0.12071825610473752, "learning_rate": 1.9862270768119474e-05, "loss": 0.0048, "num_tokens": 6382098.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 162.0, "completions/mean_terminated_length": 162.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.14812765172477402, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.07053597504273057, "learning_rate": 1.9861203511997417e-05, "loss": 0.0028, "num_tokens": 6387266.0, "reward": 1.581730842590332, "reward_std": 0.42003777623176575, "rewards/fixed_code_pass_all_test_reward/mean": 0.5817307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.42003777623176575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 319.75, "completions/mean_terminated_length": 319.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.14831211953514112, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.06711819255724549, "learning_rate": 1.986013216566108e-05, "loss": 0.0027, "num_tokens": 6394352.0, "reward": 1.4047619104385376, "reward_std": 0.06734351068735123, "rewards/fixed_code_pass_all_test_reward/mean": 0.4047619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.06734350323677063, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 94.75, "completions/mean_terminated_length": 94.75, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.14849658734550822, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.13540014158934355, "learning_rate": 1.9859056729554845e-05, "loss": 0.0054, "num_tokens": 6400966.0, "reward": 1.149999976158142, "reward_std": 0.3457620143890381, "rewards/fixed_code_pass_all_test_reward/mean": 0.15000000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.3457620143890381, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.1486810551558753, "frac_reward_zero_std": 1.0, "grad_norm": 0.1611328125, "kl": 0.05812676367349923, "learning_rate": 1.9857977204124768e-05, "loss": 0.0023, "num_tokens": 6405423.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.1488655229662424, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.05207123002037406, "learning_rate": 1.9856893589818623e-05, "loss": 0.0021, "num_tokens": 6411973.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 237.625, "completions/mean_terminated_length": 237.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.1490499907766095, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.05990785779431462, "learning_rate": 1.9855805887085863e-05, "loss": 0.0024, "num_tokens": 6420434.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.14923445858697656, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.0719997517298907, "learning_rate": 1.9854714096377648e-05, "loss": 0.0029, "num_tokens": 6428326.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.14941892639734367, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.08311649737879634, "learning_rate": 1.9853618218146825e-05, "loss": 0.0033, "num_tokens": 6435101.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 426.5, "completions/mean_terminated_length": 426.5, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.14960339420771077, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.0223251938004978, "learning_rate": 1.985251825284794e-05, "loss": 0.0009, "num_tokens": 6442945.0, "reward": 1.9659091234207153, "reward_std": 0.047049880027770996, "rewards/fixed_code_pass_all_test_reward/mean": 0.9659091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.047049909830093384, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.14978786201807784, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.06880708155222237, "learning_rate": 1.985141420093724e-05, "loss": 0.0028, "num_tokens": 6448137.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.14997232982844494, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.08316550776362419, "learning_rate": 1.9850306062872662e-05, "loss": 0.0033, "num_tokens": 6456099.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.15015679763881204, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.07240581046789885, "learning_rate": 1.9849193839113833e-05, "loss": 0.0029, "num_tokens": 6462946.0, "reward": 1.7445652484893799, "reward_std": 0.11251069605350494, "rewards/fixed_code_pass_all_test_reward/mean": 0.7445651888847351, "rewards/fixed_code_pass_all_test_reward/std": 0.11251069605350494, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 352.25, "completions/mean_terminated_length": 352.25, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.1503412654491791, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.04959081788547337, "learning_rate": 1.9848077530122083e-05, "loss": 0.002, "num_tokens": 6469764.0, "reward": 1.6785714626312256, "reward_std": 0.4501376748085022, "rewards/fixed_code_pass_all_test_reward/mean": 0.8035714030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.3657134771347046, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 259.375, "completions/mean_terminated_length": 259.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.1505257332595462, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.07414586562663317, "learning_rate": 1.984695713636043e-05, "loss": 0.003, "num_tokens": 6476615.0, "reward": 1.2828947305679321, "reward_std": 0.4447624385356903, "rewards/fixed_code_pass_all_test_reward/mean": 0.28289473056793213, "rewards/fixed_code_pass_all_test_reward/std": 0.4447624385356903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.1507102010699133, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.07733675139024854, "learning_rate": 1.984583265829359e-05, "loss": 0.0031, "num_tokens": 6484813.0, "reward": 1.8841463327407837, "reward_std": 0.32768362760543823, "rewards/fixed_code_pass_all_test_reward/mean": 0.8841463327407837, "rewards/fixed_code_pass_all_test_reward/std": 0.3276836574077606, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 306.375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.15089466888028039, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.048105229856446385, "learning_rate": 1.9844704096387973e-05, "loss": 0.0019, "num_tokens": 6490952.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 382.125, "completions/mean_terminated_length": 382.125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.1510791366906475, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.07186682801693678, "learning_rate": 1.9843571451111684e-05, "loss": 0.0029, "num_tokens": 6498753.0, "reward": 1.0, "reward_std": 0.3977196216583252, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.11947115510702133, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 237.0, "completions/mean_terminated_length": 237.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.15126360450101456, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.10549730621278286, "learning_rate": 1.9842434722934517e-05, "loss": 0.0042, "num_tokens": 6506697.0, "reward": 1.890625, "reward_std": 0.20834393799304962, "rewards/fixed_code_pass_all_test_reward/mean": 0.890625, "rewards/fixed_code_pass_all_test_reward/std": 0.208343967795372, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.15144807231138166, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.0811154986731708, "learning_rate": 1.9841293912327963e-05, "loss": 0.0032, "num_tokens": 6514298.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 529.375, "completions/mean_terminated_length": 529.375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.15163254012174876, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.04714387049898505, "learning_rate": 1.9840149019765206e-05, "loss": 0.0019, "num_tokens": 6524909.0, "reward": 1.4583333730697632, "reward_std": 0.48997896909713745, "rewards/fixed_code_pass_all_test_reward/mean": 0.4583333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.48997893929481506, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 126.375, "completions/mean_terminated_length": 126.375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.15181700793211583, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.09472138807177544, "learning_rate": 1.9839000045721118e-05, "loss": 0.0038, "num_tokens": 6531304.0, "reward": 1.9537036418914795, "reward_std": 0.08572408556938171, "rewards/fixed_code_pass_all_test_reward/mean": 0.9537037014961243, "rewards/fixed_code_pass_all_test_reward/std": 0.08572409301996231, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.15200147574248293, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.05834426870569587, "learning_rate": 1.9837846990672277e-05, "loss": 0.0023, "num_tokens": 6540700.0, "reward": 1.1299999952316284, "reward_std": 0.35181164741516113, "rewards/fixed_code_pass_all_test_reward/mean": 0.12999999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.35181164741516113, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.15218594355285003, "frac_reward_zero_std": 1.0, "grad_norm": 0.2001953125, "kl": 0.08281086757779121, "learning_rate": 1.983668985509694e-05, "loss": 0.0033, "num_tokens": 6546034.0, "reward": 1.0860215425491333, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.08602150529623032, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 258.125, "completions/mean_terminated_length": 258.125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.1523704113632171, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.06237364048138261, "learning_rate": 1.9835528639475066e-05, "loss": 0.0025, "num_tokens": 6557179.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 437.625, "completions/mean_terminated_length": 437.625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.1525548791735842, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.06687330082058907, "learning_rate": 1.9834363344288295e-05, "loss": 0.0027, "num_tokens": 6569968.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 218.125, "completions/mean_terminated_length": 218.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.1527393469839513, "frac_reward_zero_std": 1.0, "grad_norm": 0.26171875, "kl": 0.05275096604600549, "learning_rate": 1.9833193970019972e-05, "loss": 0.0021, "num_tokens": 6575129.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 181.0, "completions/mean_terminated_length": 181.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.15292381479431838, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.08440178632736206, "learning_rate": 1.983202051715513e-05, "loss": 0.0034, "num_tokens": 6584113.0, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.15310828260468548, "frac_reward_zero_std": 1.0, "grad_norm": 0.3203125, "kl": 0.10972986370325089, "learning_rate": 1.9830842986180486e-05, "loss": 0.0044, "num_tokens": 6592272.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.15329275041505258, "frac_reward_zero_std": 1.0, "grad_norm": 0.435546875, "kl": 0.10315335262566805, "learning_rate": 1.9829661377584456e-05, "loss": 0.0041, "num_tokens": 6601345.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 139.25, "completions/mean_terminated_length": 139.25, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.15347721822541965, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.11440882552415133, "learning_rate": 1.9828475691857148e-05, "loss": 0.0046, "num_tokens": 6608163.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 99.875, "completions/mean_terminated_length": 99.875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.15366168603578675, "frac_reward_zero_std": 0.0, "grad_norm": 3.71875, "kl": 0.12600817531347275, "learning_rate": 1.9827285929490356e-05, "loss": 0.005, "num_tokens": 6615786.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 390.625, "completions/mean_terminated_length": 390.625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.15384615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.05436290567740798, "learning_rate": 1.9826092090977574e-05, "loss": 0.0022, "num_tokens": 6629727.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.15403062165652093, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.06539074704051018, "learning_rate": 1.9824894176813975e-05, "loss": 0.0026, "num_tokens": 6634121.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.15421508946688803, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.13106460496783257, "learning_rate": 1.9823692187496424e-05, "loss": 0.0052, "num_tokens": 6643206.0, "reward": 1.7361111640930176, "reward_std": 0.36420121788978577, "rewards/fixed_code_pass_all_test_reward/mean": 0.7361111640930176, "rewards/fixed_code_pass_all_test_reward/std": 0.36420127749443054, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 178.125, "completions/mean_terminated_length": 178.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.15439955727725513, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.09188244305551052, "learning_rate": 1.982248612352349e-05, "loss": 0.0037, "num_tokens": 6665799.0, "reward": 1.1129517555236816, "reward_std": 0.2526172995567322, "rewards/fixed_code_pass_all_test_reward/mean": 0.11295180022716522, "rewards/fixed_code_pass_all_test_reward/std": 0.2526172995567322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 89.375, "completions/mean_terminated_length": 89.375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.1545840250876222, "frac_reward_zero_std": 0.0, "grad_norm": 3.75, "kl": 0.0975748561322689, "learning_rate": 1.9821275985395414e-05, "loss": 0.0039, "num_tokens": 6669394.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.1547684928979893, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.11096777860075235, "learning_rate": 1.9820061773614137e-05, "loss": 0.0044, "num_tokens": 6676492.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 148.625, "completions/mean_terminated_length": 148.625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.1549529607083564, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.1314147561788559, "learning_rate": 1.9818843488683294e-05, "loss": 0.0053, "num_tokens": 6685473.0, "reward": 1.8522727489471436, "reward_std": 0.1234825924038887, "rewards/fixed_code_pass_all_test_reward/mean": 0.8522727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.12348254770040512, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 205.875, "completions/mean_terminated_length": 205.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.15513742851872347, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.05889810808002949, "learning_rate": 1.9817621131108197e-05, "loss": 0.0024, "num_tokens": 6691032.0, "reward": 1.59375, "reward_std": 0.46170300245285034, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.46170300245285034, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.15532189632909058, "frac_reward_zero_std": 0.0, "grad_norm": 4.15625, "kl": 0.10274303751066327, "learning_rate": 1.9816394701395853e-05, "loss": 0.0041, "num_tokens": 6697264.0, "reward": 1.5625, "reward_std": 0.6781013607978821, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.15550636413945768, "frac_reward_zero_std": 1.0, "grad_norm": 0.2109375, "kl": 0.09393032314255834, "learning_rate": 1.9815164200054963e-05, "loss": 0.0038, "num_tokens": 6704422.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.15569083194982475, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.11198170250281692, "learning_rate": 1.981392962759591e-05, "loss": 0.0045, "num_tokens": 6711827.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 198.25, "completions/mean_terminated_length": 198.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.15587529976019185, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.07248180755414069, "learning_rate": 1.9812690984530766e-05, "loss": 0.0029, "num_tokens": 6716309.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.15605976757055895, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.11459195520728827, "learning_rate": 1.9811448271373296e-05, "loss": 0.0046, "num_tokens": 6721288.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 93.5, "completions/mean_terminated_length": 93.5, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.15624423538092602, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.14778584707528353, "learning_rate": 1.981020148863895e-05, "loss": 0.0059, "num_tokens": 6728900.0, "reward": 1.8823529481887817, "reward_std": 0.3327561318874359, "rewards/fixed_code_pass_all_test_reward/mean": 0.8823529481887817, "rewards/fixed_code_pass_all_test_reward/std": 0.3327561616897583, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 138.25, "completions/mean_terminated_length": 138.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.15642870319129312, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.10695195430889726, "learning_rate": 1.9808950636844872e-05, "loss": 0.0043, "num_tokens": 6737974.0, "reward": 0.932692289352417, "reward_std": 0.38557571172714233, "rewards/fixed_code_pass_all_test_reward/mean": 0.057692307978868484, "rewards/fixed_code_pass_all_test_reward/std": 0.08476510643959045, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 193.625, "completions/mean_terminated_length": 193.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.15661317100166022, "frac_reward_zero_std": 1.0, "grad_norm": 0.310546875, "kl": 0.10972925461828709, "learning_rate": 1.9807695716509876e-05, "loss": 0.0044, "num_tokens": 6743603.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 178.125, "completions/mean_terminated_length": 178.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.1567976388120273, "frac_reward_zero_std": 1.0, "grad_norm": 0.6875, "kl": 0.1108173131942749, "learning_rate": 1.9806436728154484e-05, "loss": 0.0044, "num_tokens": 6754196.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 261.875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.1569821066223944, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.06828380422666669, "learning_rate": 1.9805173672300897e-05, "loss": 0.0027, "num_tokens": 6760995.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.1571665744327615, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.06363218417391181, "learning_rate": 1.9803906549473002e-05, "loss": 0.0025, "num_tokens": 6767154.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.15735104224312857, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.06687038019299507, "learning_rate": 1.9802635360196375e-05, "loss": 0.0027, "num_tokens": 6777884.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.15753551005349567, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.05679137515835464, "learning_rate": 1.9801360104998276e-05, "loss": 0.0023, "num_tokens": 6786084.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.15771997786386274, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.10188232036307454, "learning_rate": 1.9800080784407657e-05, "loss": 0.0041, "num_tokens": 6794221.0, "reward": 1.112804889678955, "reward_std": 0.10260274261236191, "rewards/fixed_code_pass_all_test_reward/mean": 0.11280487477779388, "rewards/fixed_code_pass_all_test_reward/std": 0.10260274261236191, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 311.625, "completions/mean_terminated_length": 311.625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.15790444567422984, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.04763806005939841, "learning_rate": 1.9798797398955147e-05, "loss": 0.0019, "num_tokens": 6802746.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.15808891348459694, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.1282939212396741, "learning_rate": 1.9797509949173072e-05, "loss": 0.0051, "num_tokens": 6807943.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.15827338129496402, "frac_reward_zero_std": 1.0, "grad_norm": 0.1826171875, "kl": 0.11027776449918747, "learning_rate": 1.9796218435595436e-05, "loss": 0.0044, "num_tokens": 6817120.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.15845784910533112, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.09702498512342572, "learning_rate": 1.979492285875793e-05, "loss": 0.0039, "num_tokens": 6825212.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 440.75, "completions/mean_terminated_length": 440.75, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.15864231691569822, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.12024405039846897, "learning_rate": 1.9793623219197933e-05, "loss": 0.0048, "num_tokens": 6834482.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 177.5, "completions/mean_terminated_length": 177.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.1588267847260653, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.1399065339937806, "learning_rate": 1.9792319517454507e-05, "loss": 0.0056, "num_tokens": 6842774.0, "reward": 1.5258620977401733, "reward_std": 0.739745020866394, "rewards/fixed_code_pass_all_test_reward/mean": 0.6508620977401733, "rewards/fixed_code_pass_all_test_reward/std": 0.4860680401325226, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 223.375, "completions/mean_terminated_length": 223.375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.1590112525364324, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.10297096101567149, "learning_rate": 1.9791011754068395e-05, "loss": 0.0041, "num_tokens": 6851249.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1591957203467995, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.05475117522291839, "learning_rate": 1.978969992958204e-05, "loss": 0.0022, "num_tokens": 6856649.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.15938018815716656, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.20001666154712439, "learning_rate": 1.9788384044539545e-05, "loss": 0.008, "num_tokens": 6865881.0, "reward": 1.6534810066223145, "reward_std": 0.23825323581695557, "rewards/fixed_code_pass_all_test_reward/mean": 0.6534810066223145, "rewards/fixed_code_pass_all_test_reward/std": 0.23825323581695557, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 369.75, "completions/mean_terminated_length": 369.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.15956465596753366, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.07951619196683168, "learning_rate": 1.9787064099486724e-05, "loss": 0.0032, "num_tokens": 6875943.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.15974912377790076, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.09619615180417895, "learning_rate": 1.978574009497105e-05, "loss": 0.0038, "num_tokens": 6883245.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 370.875, "completions/mean_terminated_length": 370.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.15993359158826784, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.05738591309636831, "learning_rate": 1.9784412031541697e-05, "loss": 0.0023, "num_tokens": 6893564.0, "reward": 1.25, "reward_std": 0.26051297783851624, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.2605130076408386, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 179.875, "completions/mean_terminated_length": 179.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.16011805939863494, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.15418710745871067, "learning_rate": 1.9783079909749516e-05, "loss": 0.0062, "num_tokens": 6898643.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 119.375, "completions/mean_terminated_length": 119.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.16030252720900204, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.12715440057218075, "learning_rate": 1.9781743730147047e-05, "loss": 0.0051, "num_tokens": 6905398.0, "reward": 1.8308823108673096, "reward_std": 0.31353941559791565, "rewards/fixed_code_pass_all_test_reward/mean": 0.8308823108673096, "rewards/fixed_code_pass_all_test_reward/std": 0.31353941559791565, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 218.5, "completions/mean_terminated_length": 218.5, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.1604869950193691, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.11552065052092075, "learning_rate": 1.97804034932885e-05, "loss": 0.0046, "num_tokens": 6911418.0, "reward": 1.9239130020141602, "reward_std": 0.04696187749505043, "rewards/fixed_code_pass_all_test_reward/mean": 0.9239130616188049, "rewards/fixed_code_pass_all_test_reward/std": 0.04696187749505043, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 237.25, "completions/mean_terminated_length": 237.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.1606714628297362, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.14500899612903595, "learning_rate": 1.9779059199729782e-05, "loss": 0.0058, "num_tokens": 6917556.0, "reward": 1.688829779624939, "reward_std": 0.2783292829990387, "rewards/fixed_code_pass_all_test_reward/mean": 0.688829779624939, "rewards/fixed_code_pass_all_test_reward/std": 0.2783292829990387, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 105.0, "completions/mean_terminated_length": 105.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.1608559306401033, "frac_reward_zero_std": 1.0, "grad_norm": 0.373046875, "kl": 0.19173415005207062, "learning_rate": 1.9777710850028475e-05, "loss": 0.0077, "num_tokens": 6923780.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 356.375, "completions/mean_terminated_length": 356.375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.16104039845047038, "frac_reward_zero_std": 1.0, "grad_norm": 0.79296875, "kl": 0.10933925630524755, "learning_rate": 1.9776358444743845e-05, "loss": 0.0044, "num_tokens": 6932031.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.16122486626083748, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.08826982416212559, "learning_rate": 1.9775001984436842e-05, "loss": 0.0035, "num_tokens": 6939711.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 298.0, "completions/mean_terminated_length": 298.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.16140933407120459, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.08268321445211768, "learning_rate": 1.9773641469670098e-05, "loss": 0.0033, "num_tokens": 6946631.0, "reward": 1.0806450843811035, "reward_std": 0.04561978578567505, "rewards/fixed_code_pass_all_test_reward/mean": 0.08064515888690948, "rewards/fixed_code_pass_all_test_reward/std": 0.045619793236255646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.16159380188157166, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.07872053608298302, "learning_rate": 1.9772276901007924e-05, "loss": 0.0031, "num_tokens": 6955733.0, "reward": 1.7678571939468384, "reward_std": 0.25253817439079285, "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 234.0, "completions/mean_terminated_length": 234.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.16177826969193876, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.09283984638750553, "learning_rate": 1.977090827901631e-05, "loss": 0.0037, "num_tokens": 6961821.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.16196273750230586, "frac_reward_zero_std": 1.0, "grad_norm": 0.2373046875, "kl": 0.1555776884779334, "learning_rate": 1.9769535604262934e-05, "loss": 0.0062, "num_tokens": 6970750.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.16214720531267293, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.1563334334641695, "learning_rate": 1.976815887731715e-05, "loss": 0.0063, "num_tokens": 6977333.0, "reward": 0.8958333730697632, "reward_std": 0.895037055015564, "rewards/fixed_code_pass_all_test_reward/mean": 0.3958333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.4537104368209839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.16233167312304003, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.1701475726440549, "learning_rate": 1.9766778098749996e-05, "loss": 0.0068, "num_tokens": 6985576.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 131.375, "completions/mean_terminated_length": 131.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.16251614093340713, "frac_reward_zero_std": 0.0, "grad_norm": 3.828125, "kl": 0.137057987973094, "learning_rate": 1.976539326913419e-05, "loss": 0.0055, "num_tokens": 6991723.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 192.625, "completions/mean_terminated_length": 192.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1627006087437742, "frac_reward_zero_std": 0.0, "grad_norm": 3.375, "kl": 0.11625861003994942, "learning_rate": 1.976400438904413e-05, "loss": 0.0047, "num_tokens": 6997384.0, "reward": 1.5625, "reward_std": 0.2289411872625351, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.22894108295440674, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 300.375, "completions/mean_terminated_length": 300.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.1628850765541413, "frac_reward_zero_std": 1.0, "grad_norm": 0.203125, "kl": 0.07764099817723036, "learning_rate": 1.976261145905589e-05, "loss": 0.0031, "num_tokens": 7003211.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 122.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1630695443645084, "frac_reward_zero_std": 0.0, "grad_norm": 3.5, "kl": 0.200009873136878, "learning_rate": 1.9761214479747228e-05, "loss": 0.008, "num_tokens": 7007010.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 167.875, "completions/mean_terminated_length": 167.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.16325401217487548, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.1430496433749795, "learning_rate": 1.9759813451697584e-05, "loss": 0.0057, "num_tokens": 7013121.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 249.125, "completions/mean_terminated_length": 249.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.16343847998524258, "frac_reward_zero_std": 1.0, "grad_norm": 0.2734375, "kl": 0.09621597826480865, "learning_rate": 1.975840837548807e-05, "loss": 0.0038, "num_tokens": 7019634.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 242.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.16362294779560965, "frac_reward_zero_std": 1.0, "grad_norm": 0.353515625, "kl": 0.10708995256572962, "learning_rate": 1.9756999251701486e-05, "loss": 0.0043, "num_tokens": 7029657.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 212.125, "completions/mean_terminated_length": 212.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.16380741560597675, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.11096573481336236, "learning_rate": 1.97555860809223e-05, "loss": 0.0044, "num_tokens": 7039850.0, "reward": 1.4305555820465088, "reward_std": 0.41334646940231323, "rewards/fixed_code_pass_all_test_reward/mean": 0.4305555522441864, "rewards/fixed_code_pass_all_test_reward/std": 0.4133465588092804, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.16399188341634385, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.12716420739889145, "learning_rate": 1.9754168863736675e-05, "loss": 0.0051, "num_tokens": 7047934.0, "reward": 1.3636363744735718, "reward_std": 0.2571297585964203, "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 236.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.16417635122671093, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.06372693879529834, "learning_rate": 1.975274760073243e-05, "loss": 0.0025, "num_tokens": 7057122.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 362.25, "completions/mean_terminated_length": 362.25, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.16436081903707803, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.05681471573188901, "learning_rate": 1.975132229249908e-05, "loss": 0.0023, "num_tokens": 7068132.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 332.25, "completions/mean_terminated_length": 332.25, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.16454528684744513, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.04818777577020228, "learning_rate": 1.9749892939627814e-05, "loss": 0.0019, "num_tokens": 7075830.0, "reward": 1.923076868057251, "reward_std": 0.0411171093583107, "rewards/fixed_code_pass_all_test_reward/mean": 0.9230769276618958, "rewards/fixed_code_pass_all_test_reward/std": 0.041117113083601, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.1647297546578122, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.10812587849795818, "learning_rate": 1.9748459542711492e-05, "loss": 0.0043, "num_tokens": 7083125.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.1649142224681793, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.10130429733544588, "learning_rate": 1.9747022102344663e-05, "loss": 0.0041, "num_tokens": 7089474.0, "reward": 1.1531250476837158, "reward_std": 0.20460651814937592, "rewards/fixed_code_pass_all_test_reward/mean": 0.15312500298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.2046065628528595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.1650986902785464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.05821435945108533, "learning_rate": 1.9745580619123535e-05, "loss": 0.0023, "num_tokens": 7096544.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 585.75, "completions/mean_terminated_length": 585.75, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.16528315808891347, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.05744888773187995, "learning_rate": 1.9744135093646016e-05, "loss": 0.0023, "num_tokens": 7112702.0, "reward": 1.34375, "reward_std": 0.35197147727012634, "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, "rewards/fixed_code_pass_all_test_reward/std": 0.35197150707244873, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 179.875, "completions/mean_terminated_length": 179.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.16546762589928057, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.08735818136483431, "learning_rate": 1.974268552651167e-05, "loss": 0.0035, "num_tokens": 7120661.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 364.0, "completions/mean_terminated_length": 364.0, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.16565209370964767, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.05464829970151186, "learning_rate": 1.9741231918321752e-05, "loss": 0.0022, "num_tokens": 7129253.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 263.375, "completions/mean_terminated_length": 263.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.16583656152001475, "frac_reward_zero_std": 1.0, "grad_norm": 0.23828125, "kl": 0.07434893515892327, "learning_rate": 1.9739774269679186e-05, "loss": 0.003, "num_tokens": 7137392.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 850.375, "completions/mean_terminated_length": 850.375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.16602102933038185, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.03254843386821449, "learning_rate": 1.973831258118857e-05, "loss": 0.0013, "num_tokens": 7155283.0, "reward": 1.0462963581085205, "reward_std": 0.08572408556938171, "rewards/fixed_code_pass_all_test_reward/mean": 0.046296294778585434, "rewards/fixed_code_pass_all_test_reward/std": 0.08572407811880112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.16620549714074895, "frac_reward_zero_std": 1.0, "grad_norm": 0.1708984375, "kl": 0.06579160725232214, "learning_rate": 1.9736846853456184e-05, "loss": 0.0026, "num_tokens": 7160546.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.16638996495111602, "frac_reward_zero_std": 1.0, "grad_norm": 0.2119140625, "kl": 0.07186678983271122, "learning_rate": 1.9735377087089977e-05, "loss": 0.0029, "num_tokens": 7164768.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 263.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.16657443276148312, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.05913881375454366, "learning_rate": 1.9733903282699583e-05, "loss": 0.0024, "num_tokens": 7170830.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 282.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.16675890057185022, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.0636452422477305, "learning_rate": 1.9732425440896298e-05, "loss": 0.0025, "num_tokens": 7180634.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 430.75, "completions/mean_terminated_length": 430.75, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.1669433683822173, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.05850100051611662, "learning_rate": 1.97309435622931e-05, "loss": 0.0023, "num_tokens": 7189984.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 313.5, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.1671278361925844, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.06530113541521132, "learning_rate": 1.972945764750464e-05, "loss": 0.0026, "num_tokens": 7197340.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.1673123040029515, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.10376751748844981, "learning_rate": 1.9727967697147246e-05, "loss": 0.0041, "num_tokens": 7204710.0, "reward": 1.9966216087341309, "reward_std": 0.009555509313941002, "rewards/fixed_code_pass_all_test_reward/mean": 0.9966216087341309, "rewards/fixed_code_pass_all_test_reward/std": 0.009555491618812084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 430.625, "completions/mean_terminated_length": 430.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.16749677181331857, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.04498612892348319, "learning_rate": 1.9726473711838914e-05, "loss": 0.0018, "num_tokens": 7213499.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.16768123962368567, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.053325997898355126, "learning_rate": 1.972497569219932e-05, "loss": 0.0021, "num_tokens": 7220299.0, "reward": 0.8958333134651184, "reward_std": 0.5034602284431458, "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 573.875, "completions/mean_terminated_length": 363.2857360839844, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.16786570743405277, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.09947013488272205, "learning_rate": 1.9723473638849807e-05, "loss": 0.004, "num_tokens": 7227730.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 202.375, "completions/mean_terminated_length": 202.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.16805017524441984, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.09031926328316331, "learning_rate": 1.9721967552413396e-05, "loss": 0.0036, "num_tokens": 7234869.0, "reward": 1.6182432174682617, "reward_std": 0.3563463091850281, "rewards/fixed_code_pass_all_test_reward/mean": 0.6182432174682617, "rewards/fixed_code_pass_all_test_reward/std": 0.3563463091850281, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.16823464305478694, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.0652376830112189, "learning_rate": 1.972045743351478e-05, "loss": 0.0026, "num_tokens": 7242578.0, "reward": 1.6875, "reward_std": 0.33514389395713806, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.33514389395713806, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 261.0, "completions/mean_terminated_length": 261.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.16841911086515404, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.10850547254085541, "learning_rate": 1.9718943282780323e-05, "loss": 0.0043, "num_tokens": 7250930.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.16860357867552112, "frac_reward_zero_std": 0.0, "grad_norm": 3.28125, "kl": 0.0788644920103252, "learning_rate": 1.971742510083806e-05, "loss": 0.0032, "num_tokens": 7259093.0, "reward": 1.3499999046325684, "reward_std": 0.6480740308761597, "rewards/fixed_code_pass_all_test_reward/mean": 0.4750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.39910614490509033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 293.625, "completions/mean_terminated_length": 293.625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.16878804648588822, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.10785340843722224, "learning_rate": 1.9715902888317703e-05, "loss": 0.0043, "num_tokens": 7266362.0, "reward": 1.5899999141693115, "reward_std": 0.24819344282150269, "rewards/fixed_code_pass_all_test_reward/mean": 0.5900000333786011, "rewards/fixed_code_pass_all_test_reward/std": 0.24819347262382507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 292.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.16897251429625532, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.07697090972214937, "learning_rate": 1.9714376645850635e-05, "loss": 0.0031, "num_tokens": 7271585.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 340.125, "completions/mean_terminated_length": 340.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.1691569821066224, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.05403205961920321, "learning_rate": 1.97128463740699e-05, "loss": 0.0022, "num_tokens": 7281690.0, "reward": 1.2613637447357178, "reward_std": 0.6097157597541809, "rewards/fixed_code_pass_all_test_reward/mean": 0.511363685131073, "rewards/fixed_code_pass_all_test_reward/std": 0.3146839439868927, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 261.5, "completions/mean_terminated_length": 261.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.1693414499169895, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.08242123294621706, "learning_rate": 1.9711312073610228e-05, "loss": 0.0033, "num_tokens": 7289430.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.1695259177273566, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.05527952592819929, "learning_rate": 1.9709773745108014e-05, "loss": 0.0022, "num_tokens": 7294123.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 871.375, "completions/mean_terminated_length": 871.375, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 0.16971038553772366, "frac_reward_zero_std": 1.0, "grad_norm": 0.042236328125, "kl": 0.03795198444277048, "learning_rate": 1.9708231389201323e-05, "loss": 0.0015, "num_tokens": 7313774.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 114.25, "completions/mean_terminated_length": 114.25, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.16989485334809076, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.058263259241357446, "learning_rate": 1.9706685006529887e-05, "loss": 0.0023, "num_tokens": 7321848.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.17007932115845784, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.12449900433421135, "learning_rate": 1.9705134597735113e-05, "loss": 0.005, "num_tokens": 7330424.0, "reward": 1.9449999332427979, "reward_std": 0.10184022039175034, "rewards/fixed_code_pass_all_test_reward/mean": 0.9449999928474426, "rewards/fixed_code_pass_all_test_reward/std": 0.10184022039175034, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.17026378896882494, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.04106311989016831, "learning_rate": 1.9703580163460084e-05, "loss": 0.0016, "num_tokens": 7334757.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 506.75, "completions/mean_terminated_length": 286.5714416503906, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.17044825677919204, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.06428861629683524, "learning_rate": 1.970202170434954e-05, "loss": 0.0026, "num_tokens": 7341819.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 153.125, "completions/mean_terminated_length": 153.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1706327245895591, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.038407051004469395, "learning_rate": 1.9700459221049897e-05, "loss": 0.0015, "num_tokens": 7348236.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.1708171923999262, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.06990534765645862, "learning_rate": 1.969889271420924e-05, "loss": 0.0028, "num_tokens": 7353830.0, "reward": 1.751838207244873, "reward_std": 0.1446092575788498, "rewards/fixed_code_pass_all_test_reward/mean": 0.7518382668495178, "rewards/fixed_code_pass_all_test_reward/std": 0.14460931718349457, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.1710016602102933, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.06991973123513162, "learning_rate": 1.9697322184477317e-05, "loss": 0.0028, "num_tokens": 7362122.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.17118612802066038, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.09982729004696012, "learning_rate": 1.969574763250556e-05, "loss": 0.004, "num_tokens": 7371228.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 315.375, "completions/mean_terminated_length": 315.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.17137059583102748, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.05986392218619585, "learning_rate": 1.969416905894705e-05, "loss": 0.0024, "num_tokens": 7378559.0, "reward": 1.125, "reward_std": 0.13363061845302582, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.13363061845302582, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 260.0, "completions/mean_terminated_length": 260.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.17155506364139458, "frac_reward_zero_std": 1.0, "grad_norm": 0.1689453125, "kl": 0.0681958394125104, "learning_rate": 1.9692586464456548e-05, "loss": 0.0027, "num_tokens": 7388391.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.17173953145176166, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "kl": 0.09671210404485464, "learning_rate": 1.9690999849690485e-05, "loss": 0.0039, "num_tokens": 7395181.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.17192399926212876, "frac_reward_zero_std": 1.0, "grad_norm": 0.26953125, "kl": 0.1204031347297132, "learning_rate": 1.968940921530695e-05, "loss": 0.0048, "num_tokens": 7399352.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.17210846707249586, "frac_reward_zero_std": 1.0, "grad_norm": 0.275390625, "kl": 0.1205824832431972, "learning_rate": 1.96878145619657e-05, "loss": 0.0048, "num_tokens": 7405118.0, "reward": 1.308823585510254, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.30882352590560913, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 307.75, "completions/mean_terminated_length": 307.75, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.17229293488286293, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.08492859872058034, "learning_rate": 1.968621589032817e-05, "loss": 0.0034, "num_tokens": 7416700.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.17247740269323003, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.11165731539949775, "learning_rate": 1.9684613201057453e-05, "loss": 0.0045, "num_tokens": 7424193.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.17266187050359713, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.12523596873506904, "learning_rate": 1.9683006494818305e-05, "loss": 0.005, "num_tokens": 7433205.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.1728463383139642, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.08327980013564229, "learning_rate": 1.9681395772277157e-05, "loss": 0.0033, "num_tokens": 7441681.0, "reward": 0.9527026414871216, "reward_std": 0.5880208611488342, "rewards/fixed_code_pass_all_test_reward/mean": 0.20270270109176636, "rewards/fixed_code_pass_all_test_reward/std": 0.12511081993579865, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 350.25, "completions/mean_terminated_length": 350.25, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.1730308061243313, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.057259353809058666, "learning_rate": 1.96797810341021e-05, "loss": 0.0023, "num_tokens": 7453435.0, "reward": 1.625, "reward_std": 0.25253811478614807, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 183.75, "completions/mean_terminated_length": 183.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.1732152739346984, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.11962222261354327, "learning_rate": 1.9678162280962895e-05, "loss": 0.0048, "num_tokens": 7461457.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.17339974174506548, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.0923931933939457, "learning_rate": 1.9676539513530967e-05, "loss": 0.0037, "num_tokens": 7471418.0, "reward": 1.8095238208770752, "reward_std": 0.33381888270378113, "rewards/fixed_code_pass_all_test_reward/mean": 0.8095238208770752, "rewards/fixed_code_pass_all_test_reward/std": 0.33381888270378113, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 189.75, "completions/mean_terminated_length": 189.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.17358420955543258, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.05690016457810998, "learning_rate": 1.96749127324794e-05, "loss": 0.0023, "num_tokens": 7476064.0, "reward": 1.5, "reward_std": 0.4140393137931824, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.41403937339782715, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.17376867736579968, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.09683234989643097, "learning_rate": 1.9673281938482952e-05, "loss": 0.0039, "num_tokens": 7486145.0, "reward": 0.7708333134651184, "reward_std": 0.4792313575744629, "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 233.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.17395314517616675, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.09133691946044564, "learning_rate": 1.967164713221804e-05, "loss": 0.0037, "num_tokens": 7492588.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 245.875, "completions/mean_terminated_length": 245.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.17413761298653385, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.09695507073774934, "learning_rate": 1.9670008314362744e-05, "loss": 0.0039, "num_tokens": 7501491.0, "reward": 1.58152174949646, "reward_std": 0.27294766902923584, "rewards/fixed_code_pass_all_test_reward/mean": 0.58152174949646, "rewards/fixed_code_pass_all_test_reward/std": 0.27294763922691345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 356.125, "completions/mean_terminated_length": 356.125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.17432208079690095, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.06052191276103258, "learning_rate": 1.9668365485596813e-05, "loss": 0.0024, "num_tokens": 7509892.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 283.125, "completions/mean_terminated_length": 283.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.17450654860726802, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.0539945054333657, "learning_rate": 1.9666718646601654e-05, "loss": 0.0022, "num_tokens": 7516533.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.17469101641763513, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.14024799689650536, "learning_rate": 1.9665067798060344e-05, "loss": 0.0056, "num_tokens": 7521391.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 451.25, "completions/mean_terminated_length": 451.25, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.17487548422800223, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.05632214853540063, "learning_rate": 1.9663412940657615e-05, "loss": 0.0023, "num_tokens": 7533601.0, "reward": 1.6597223281860352, "reward_std": 0.012858672998845577, "rewards/fixed_code_pass_all_test_reward/mean": 0.6597222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.01285861898213625, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.1750599520383693, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.1116139916703105, "learning_rate": 1.9661754075079872e-05, "loss": 0.0045, "num_tokens": 7538936.0, "reward": 1.2000000476837158, "reward_std": 0.35913729667663574, "rewards/fixed_code_pass_all_test_reward/mean": 0.32499998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.1313198357820511, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 245.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.1752444198487364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.06465703574940562, "learning_rate": 1.966009120201517e-05, "loss": 0.0026, "num_tokens": 7545514.0, "reward": 1.8518519401550293, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8518518805503845, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.1754288876591035, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.07237509125843644, "learning_rate": 1.9658424322153237e-05, "loss": 0.0029, "num_tokens": 7551723.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 240.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.17561335546947057, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.059821920935064554, "learning_rate": 1.965675343618546e-05, "loss": 0.0024, "num_tokens": 7560948.0, "reward": 1.5813252925872803, "reward_std": 0.3454131782054901, "rewards/fixed_code_pass_all_test_reward/mean": 0.7063252925872803, "rewards/fixed_code_pass_all_test_reward/std": 0.07699471712112427, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 189.75, "completions/mean_terminated_length": 189.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.17579782327983767, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.08965201862156391, "learning_rate": 1.965507854480488e-05, "loss": 0.0036, "num_tokens": 7566570.0, "reward": 1.5113636255264282, "reward_std": 0.686241090297699, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.4051032066345215, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 220.25, "completions/mean_terminated_length": 220.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.17598229109020475, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.18948283046483994, "learning_rate": 1.965339964870621e-05, "loss": 0.0076, "num_tokens": 7570964.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.17616675890057185, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.04477545106783509, "learning_rate": 1.9651716748585822e-05, "loss": 0.0018, "num_tokens": 7579826.0, "reward": 1.6363636255264282, "reward_std": 0.2571297585964203, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 160.375, "completions/mean_terminated_length": 160.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.17635122671093895, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.09068109095096588, "learning_rate": 1.9650029845141746e-05, "loss": 0.0036, "num_tokens": 7584125.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.17653569452130602, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.11115513741970062, "learning_rate": 1.964833893907367e-05, "loss": 0.0044, "num_tokens": 7592276.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.17672016233167312, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.1512646609917283, "learning_rate": 1.9646644031082948e-05, "loss": 0.0061, "num_tokens": 7599976.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 314.375, "completions/mean_terminated_length": 314.375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.17690463014204022, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.06098993611522019, "learning_rate": 1.9644945121872586e-05, "loss": 0.0024, "num_tokens": 7607595.0, "reward": 0.3203125, "reward_std": 0.593338668346405, "rewards/fixed_code_pass_all_test_reward/mean": 0.0703125, "rewards/fixed_code_pass_all_test_reward/std": 0.13126063346862793, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 297.5, "completions/mean_terminated_length": 297.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.1770890979524073, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.05940225487574935, "learning_rate": 1.9643242212147266e-05, "loss": 0.0024, "num_tokens": 7613103.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 236.375, "completions/mean_terminated_length": 236.375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.1772735657627744, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.058653419837355614, "learning_rate": 1.9641535302613305e-05, "loss": 0.0023, "num_tokens": 7621074.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.1774580335731415, "frac_reward_zero_std": 1.0, "grad_norm": 0.54296875, "kl": 0.09793097386136651, "learning_rate": 1.96398243939787e-05, "loss": 0.0039, "num_tokens": 7625488.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 229.875, "completions/mean_terminated_length": 229.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.17764250138350857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.04710475169122219, "learning_rate": 1.96381094869531e-05, "loss": 0.0019, "num_tokens": 7634127.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.17782696919387567, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.06147215189412236, "learning_rate": 1.9636390582247805e-05, "loss": 0.0025, "num_tokens": 7642163.0, "reward": 1.091071367263794, "reward_std": 0.2575888931751251, "rewards/fixed_code_pass_all_test_reward/mean": 0.09107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.2575888931751251, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 258.5, "completions/mean_terminated_length": 258.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.17801143700424277, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.057722190860658884, "learning_rate": 1.9634667680575784e-05, "loss": 0.0023, "num_tokens": 7648807.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.17819590481460984, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.07376265153288841, "learning_rate": 1.9632940782651663e-05, "loss": 0.003, "num_tokens": 7654768.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.17838037262497694, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.037219332763925195, "learning_rate": 1.9631209889191712e-05, "loss": 0.0015, "num_tokens": 7661162.0, "reward": 1.14945650100708, "reward_std": 0.2868618071079254, "rewards/fixed_code_pass_all_test_reward/mean": 0.14945653080940247, "rewards/fixed_code_pass_all_test_reward/std": 0.2868618071079254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.17856484043534404, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.08300414541736245, "learning_rate": 1.9629475000913878e-05, "loss": 0.0033, "num_tokens": 7674215.0, "reward": 1.2000000476837158, "reward_std": 0.6414269804954529, "rewards/fixed_code_pass_all_test_reward/mean": 0.32499998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.439967542886734, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 280.25, "completions/mean_terminated_length": 280.25, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.1787493082457111, "frac_reward_zero_std": 1.0, "grad_norm": 0.48828125, "kl": 0.10370682389475405, "learning_rate": 1.9627736118537748e-05, "loss": 0.0041, "num_tokens": 7680953.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 463.875, "completions/mean_terminated_length": 463.875, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.17893377605607821, "frac_reward_zero_std": 1.0, "grad_norm": 0.059326171875, "kl": 0.03747949656099081, "learning_rate": 1.962599324278458e-05, "loss": 0.0015, "num_tokens": 7690832.0, "reward": 1.2857142686843872, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 332.75, "completions/mean_terminated_length": 332.75, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.17911824386644531, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.08319871686398983, "learning_rate": 1.962424637437727e-05, "loss": 0.0033, "num_tokens": 7698126.0, "reward": 1.18359375, "reward_std": 0.1048988401889801, "rewards/fixed_code_pass_all_test_reward/mean": 0.18359375, "rewards/fixed_code_pass_all_test_reward/std": 0.1048988401889801, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 544.75, "completions/mean_terminated_length": 544.75, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.1793027116768124, "frac_reward_zero_std": 0.0, "grad_norm": 0.86328125, "kl": 0.03083840850740671, "learning_rate": 1.9622495514040396e-05, "loss": 0.0012, "num_tokens": 7711604.0, "reward": 1.9876543283462524, "reward_std": 0.022859742864966393, "rewards/fixed_code_pass_all_test_reward/mean": 0.9876543283462524, "rewards/fixed_code_pass_all_test_reward/std": 0.022859742864966393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 192.5, "completions/mean_terminated_length": 192.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.1794871794871795, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.06797039601951838, "learning_rate": 1.9620740662500165e-05, "loss": 0.0027, "num_tokens": 7715976.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 214.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.1796716472975466, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.09643805678933859, "learning_rate": 1.961898182048446e-05, "loss": 0.0039, "num_tokens": 7721867.0, "reward": 1.5030487775802612, "reward_std": 0.09485577046871185, "rewards/fixed_code_pass_all_test_reward/mean": 0.5030487775802612, "rewards/fixed_code_pass_all_test_reward/std": 0.09485579282045364, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 193.5, "completions/mean_terminated_length": 193.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.17985611510791366, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.10062839277088642, "learning_rate": 1.9617218988722804e-05, "loss": 0.004, "num_tokens": 7727471.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 242.25, "completions/mean_terminated_length": 242.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.18004058291828076, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.04779189848341048, "learning_rate": 1.9615452167946383e-05, "loss": 0.0019, "num_tokens": 7733001.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.18022505072864786, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.07286589639261365, "learning_rate": 1.9613681358888042e-05, "loss": 0.0029, "num_tokens": 7742574.0, "reward": 1.5535714626312256, "reward_std": 0.6305511593818665, "rewards/fixed_code_pass_all_test_reward/mean": 0.6785714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.28056585788726807, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 451.75, "completions/mean_terminated_length": 451.75, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.18040951853901493, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.05102261737920344, "learning_rate": 1.9611906562282264e-05, "loss": 0.002, "num_tokens": 7750148.0, "reward": 1.0416666269302368, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.18059398634938204, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.06509739300236106, "learning_rate": 1.9610127778865204e-05, "loss": 0.0026, "num_tokens": 7756813.0, "reward": 1.625, "reward_std": 0.25253811478614807, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 1259.875, "completions/mean_terminated_length": 997.1666870117188, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.18077845415974914, "frac_reward_zero_std": 0.0, "grad_norm": 0.6015625, "kl": 0.06456740805879235, "learning_rate": 1.9608345009374666e-05, "loss": 0.0026, "num_tokens": 7769684.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 407.75, "completions/mean_terminated_length": 407.75, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.1809629219701162, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.07645057188346982, "learning_rate": 1.9606558254550095e-05, "loss": 0.0031, "num_tokens": 7776506.0, "reward": 1.9500000476837158, "reward_std": 0.09258202463388443, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.1811473897804833, "frac_reward_zero_std": 0.0, "grad_norm": 3.859375, "kl": 0.12761488277465105, "learning_rate": 1.96047675151326e-05, "loss": 0.0051, "num_tokens": 7783866.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.1813318575908504, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.10931914020329714, "learning_rate": 1.9602972791864948e-05, "loss": 0.0044, "num_tokens": 7790604.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.18151632540121748, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.12077313847839832, "learning_rate": 1.9601174085491547e-05, "loss": 0.0048, "num_tokens": 7795337.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.18170079321158458, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.12423477694392204, "learning_rate": 1.9599371396758457e-05, "loss": 0.005, "num_tokens": 7803713.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 309.625, "completions/mean_terminated_length": 309.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.18188526102195168, "frac_reward_zero_std": 0.0, "grad_norm": 12.9375, "kl": 0.09902591165155172, "learning_rate": 1.9597564726413404e-05, "loss": 0.004, "num_tokens": 7814790.0, "reward": 0.9605262875556946, "reward_std": 0.39057958126068115, "rewards/fixed_code_pass_all_test_reward/mean": 0.08552631735801697, "rewards/fixed_code_pass_all_test_reward/std": 0.055824220180511475, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 344.125, "completions/mean_terminated_length": 344.125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.18206972883231876, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.08214648067951202, "learning_rate": 1.959575407520575e-05, "loss": 0.0033, "num_tokens": 7822079.0, "reward": 1.5625, "reward_std": 0.7426556348800659, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.35456061363220215, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 394.25, "completions/mean_terminated_length": 394.25, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.18225419664268586, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.05969202215783298, "learning_rate": 1.9593939443886514e-05, "loss": 0.0024, "num_tokens": 7830465.0, "reward": 1.0833333730697632, "reward_std": 0.9229289889335632, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.4457988739013672, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 280.625, "completions/mean_terminated_length": 280.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.18243866445305293, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.09367978386580944, "learning_rate": 1.9592120833208366e-05, "loss": 0.0037, "num_tokens": 7839222.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.18262313226342003, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.08172730123624206, "learning_rate": 1.959029824392563e-05, "loss": 0.0033, "num_tokens": 7843832.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.18280760007378713, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.09926654398441315, "learning_rate": 1.958847167679427e-05, "loss": 0.004, "num_tokens": 7853502.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 257.125, "completions/mean_terminated_length": 257.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.1829920678841542, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.05530236545018852, "learning_rate": 1.9586641132571914e-05, "loss": 0.0022, "num_tokens": 7858791.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 762.25, "completions/mean_terminated_length": 762.25, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 0.1831765356945213, "frac_reward_zero_std": 0.0, "grad_norm": 0.640625, "kl": 0.024339510593563318, "learning_rate": 1.9584806612017827e-05, "loss": 0.001, "num_tokens": 7873545.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 345.375, "completions/mean_terminated_length": 345.375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.1833610035048884, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.07237664004787803, "learning_rate": 1.958296811589293e-05, "loss": 0.0029, "num_tokens": 7881452.0, "reward": 1.25, "reward_std": 0.10101527720689774, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 246.875, "completions/mean_terminated_length": 246.875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.18354547131525548, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.06600492959842086, "learning_rate": 1.9581125644959793e-05, "loss": 0.0026, "num_tokens": 7890059.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 326.375, "completions/mean_terminated_length": 326.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.18372993912562258, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.03241133620031178, "learning_rate": 1.9579279199982632e-05, "loss": 0.0013, "num_tokens": 7902262.0, "reward": 1.6126031875610352, "reward_std": 0.020453505218029022, "rewards/fixed_code_pass_all_test_reward/mean": 0.6126033067703247, "rewards/fixed_code_pass_all_test_reward/std": 0.02045350708067417, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 395.875, "completions/mean_terminated_length": 395.875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.18391440693598968, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.066024114144966, "learning_rate": 1.9577428781727313e-05, "loss": 0.0026, "num_tokens": 7915477.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.18409887474635675, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.051661615492776036, "learning_rate": 1.9575574390961343e-05, "loss": 0.0021, "num_tokens": 7921957.0, "reward": 1.2222222089767456, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2222222238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 962.875, "completions/mean_terminated_length": 962.875, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 0.18428334255672385, "frac_reward_zero_std": 0.0, "grad_norm": 0.57421875, "kl": 0.022169530275277793, "learning_rate": 1.9573716028453896e-05, "loss": 0.0009, "num_tokens": 7939684.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 381.125, "completions/mean_terminated_length": 381.125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.18446781036709095, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.0476774163544178, "learning_rate": 1.957185369497577e-05, "loss": 0.0019, "num_tokens": 7948101.0, "reward": 1.757211446762085, "reward_std": 0.14278119802474976, "rewards/fixed_code_pass_all_test_reward/mean": 0.7572115063667297, "rewards/fixed_code_pass_all_test_reward/std": 0.14278118312358856, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.18465227817745802, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.10511107463389635, "learning_rate": 1.9569987391299428e-05, "loss": 0.0042, "num_tokens": 7956267.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 203.5, "completions/mean_terminated_length": 203.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.18483674598782512, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.06870237970724702, "learning_rate": 1.9568117118198965e-05, "loss": 0.0027, "num_tokens": 7963687.0, "reward": 1.8409091234207153, "reward_std": 0.3401506841182709, "rewards/fixed_code_pass_all_test_reward/mean": 0.9659091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.021041374653577805, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 235.875, "completions/mean_terminated_length": 235.875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.18502121379819222, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.10595501773059368, "learning_rate": 1.9566242876450138e-05, "loss": 0.0042, "num_tokens": 7973278.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 131.625, "completions/mean_terminated_length": 131.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.1852056816085593, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.053871927317231894, "learning_rate": 1.9564364666830337e-05, "loss": 0.0022, "num_tokens": 7979915.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1853901494189264, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.06002357229590416, "learning_rate": 1.95624824901186e-05, "loss": 0.0024, "num_tokens": 7985748.0, "reward": 1.6881721019744873, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6881720423698425, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 265.5, "completions/mean_terminated_length": 265.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.1855746172292935, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.05871426360681653, "learning_rate": 1.9560596347095622e-05, "loss": 0.0023, "num_tokens": 7991128.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 263.25, "completions/mean_terminated_length": 263.25, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.18575908503966057, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.06638115318492055, "learning_rate": 1.955870623854373e-05, "loss": 0.0027, "num_tokens": 8002402.0, "reward": 1.648876428604126, "reward_std": 0.4881686866283417, "rewards/fixed_code_pass_all_test_reward/mean": 0.773876428604126, "rewards/fixed_code_pass_all_test_reward/std": 0.4218001961708069, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 483.5, "completions/mean_terminated_length": 483.5, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.18594355285002767, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.034278653212822974, "learning_rate": 1.9556812165246894e-05, "loss": 0.0014, "num_tokens": 8021542.0, "reward": 1.5282257795333862, "reward_std": 0.5079821944236755, "rewards/fixed_code_pass_all_test_reward/mean": 0.5282257795333862, "rewards/fixed_code_pass_all_test_reward/std": 0.5079822540283203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 257.0, "completions/mean_terminated_length": 257.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.18612802066039477, "frac_reward_zero_std": 1.0, "grad_norm": 0.3515625, "kl": 0.0860439371317625, "learning_rate": 1.9554914127990744e-05, "loss": 0.0034, "num_tokens": 8032582.0, "reward": 1.5365853309631348, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5365853905677795, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.18631248847076184, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.09122961387038231, "learning_rate": 1.955301212756254e-05, "loss": 0.0036, "num_tokens": 8036478.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 164.5, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.18649695628112894, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.14435353968292475, "learning_rate": 1.955110616475119e-05, "loss": 0.0058, "num_tokens": 8044674.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 156.375, "completions/mean_terminated_length": 156.375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.18668142409149605, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.14601379912346601, "learning_rate": 1.954919624034725e-05, "loss": 0.0058, "num_tokens": 8052109.0, "reward": 1.6805555820465088, "reward_std": 0.7063269019126892, "rewards/fixed_code_pass_all_test_reward/mean": 0.8055555820465088, "rewards/fixed_code_pass_all_test_reward/std": 0.37912923097610474, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.18686589190186312, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.04624775692354888, "learning_rate": 1.9547282355142913e-05, "loss": 0.0018, "num_tokens": 8061061.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.18705035971223022, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.10244118515402079, "learning_rate": 1.9545364509932017e-05, "loss": 0.0041, "num_tokens": 8068495.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 460.625, "completions/mean_terminated_length": 460.625, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.18723482752259732, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.05922526679933071, "learning_rate": 1.9543442705510047e-05, "loss": 0.0024, "num_tokens": 8078108.0, "reward": 1.5474138259887695, "reward_std": 0.7315913438796997, "rewards/fixed_code_pass_all_test_reward/mean": 0.6724138259887695, "rewards/fixed_code_pass_all_test_reward/std": 0.46702003479003906, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 99.5, "completions/mean_terminated_length": 99.5, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.1874192953329644, "frac_reward_zero_std": 1.0, "grad_norm": 0.11572265625, "kl": 0.05743929021991789, "learning_rate": 1.954151694267412e-05, "loss": 0.0023, "num_tokens": 8081640.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 190.25, "completions/mean_terminated_length": 190.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1876037631433315, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.06625834386795759, "learning_rate": 1.9539587222223003e-05, "loss": 0.0027, "num_tokens": 8085970.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 263.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.1877882309536986, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.046329512377269566, "learning_rate": 1.95376535449571e-05, "loss": 0.0019, "num_tokens": 8095344.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 58.375, "completions/mean_terminated_length": 58.375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.18797269876406567, "frac_reward_zero_std": 0.0, "grad_norm": 3.953125, "kl": 0.12067966721951962, "learning_rate": 1.9535715911678466e-05, "loss": 0.0048, "num_tokens": 8100451.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 291.25, "completions/mean_terminated_length": 291.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.18815716657443277, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.07876395899802446, "learning_rate": 1.9533774323190783e-05, "loss": 0.0032, "num_tokens": 8106029.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 251.625, "completions/mean_terminated_length": 251.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.18834163438479984, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.04258445277810097, "learning_rate": 1.9531828780299384e-05, "loss": 0.0017, "num_tokens": 8111850.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.18852610219516694, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.1368177398107946, "learning_rate": 1.952987928381123e-05, "loss": 0.0055, "num_tokens": 8122978.0, "reward": 1.170454502105713, "reward_std": 0.14322562515735626, "rewards/fixed_code_pass_all_test_reward/mean": 0.17045454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.14322561025619507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.18871057000553404, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.07904210221022367, "learning_rate": 1.952792583453494e-05, "loss": 0.0032, "num_tokens": 8131852.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.1888950378159011, "frac_reward_zero_std": 0.0, "grad_norm": 6.0, "kl": 0.17807579971849918, "learning_rate": 1.9525968433280756e-05, "loss": 0.0071, "num_tokens": 8136155.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 734.375, "completions/mean_terminated_length": 734.375, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.1890795056262682, "frac_reward_zero_std": 1.0, "grad_norm": 0.059326171875, "kl": 0.016462975763715804, "learning_rate": 1.952400708086057e-05, "loss": 0.0007, "num_tokens": 8150766.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.1892639734366353, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.06845093285664916, "learning_rate": 1.952204177808791e-05, "loss": 0.0027, "num_tokens": 8157891.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.18944844124700239, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.0465018218383193, "learning_rate": 1.9520072525777934e-05, "loss": 0.0019, "num_tokens": 8164826.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 224.5, "completions/mean_terminated_length": 224.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.1896329090573695, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.08247572090476751, "learning_rate": 1.951809932474745e-05, "loss": 0.0033, "num_tokens": 8171054.0, "reward": 1.3333332538604736, "reward_std": 0.05611032992601395, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.056110311299562454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1898173768677366, "frac_reward_zero_std": 1.0, "grad_norm": 1.0625, "kl": 0.2733142767101526, "learning_rate": 1.9516122175814896e-05, "loss": 0.0109, "num_tokens": 8179202.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.19000184467810366, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.14994349144399166, "learning_rate": 1.951414107980036e-05, "loss": 0.006, "num_tokens": 8183568.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 205.875, "completions/mean_terminated_length": 205.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.19018631248847076, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.07327735750004649, "learning_rate": 1.9512156037525547e-05, "loss": 0.0029, "num_tokens": 8191375.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.19037078029883786, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.14160102512687445, "learning_rate": 1.9510167049813813e-05, "loss": 0.0057, "num_tokens": 8195164.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.19055524810920493, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.07681549107655883, "learning_rate": 1.9508174117490147e-05, "loss": 0.0031, "num_tokens": 8199147.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.19073971591957203, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.1043839487247169, "learning_rate": 1.9506177241381178e-05, "loss": 0.0042, "num_tokens": 8204603.0, "reward": 1.975000023841858, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 129.5, "completions/mean_terminated_length": 129.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.19092418372993913, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.11814405722543597, "learning_rate": 1.9504176422315164e-05, "loss": 0.0047, "num_tokens": 8212831.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 126.75, "completions/mean_terminated_length": 126.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.1911086515403062, "frac_reward_zero_std": 0.0, "grad_norm": 4.0625, "kl": 0.0716338565107435, "learning_rate": 1.9502171661122e-05, "loss": 0.0029, "num_tokens": 8216541.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.1912931193506733, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.06848036777228117, "learning_rate": 1.950016295863322e-05, "loss": 0.0027, "num_tokens": 8225475.0, "reward": 1.4375, "reward_std": 0.6232117414474487, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 328.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.1914775871610404, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.08427120838314295, "learning_rate": 1.9498150315681992e-05, "loss": 0.0034, "num_tokens": 8235192.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.19166205497140748, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.10163014847785234, "learning_rate": 1.9496133733103114e-05, "loss": 0.0041, "num_tokens": 8241309.0, "reward": 1.376329779624939, "reward_std": 0.2520008087158203, "rewards/fixed_code_pass_all_test_reward/mean": 0.37632977962493896, "rewards/fixed_code_pass_all_test_reward/std": 0.2520008385181427, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 188.625, "completions/mean_terminated_length": 188.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.19184652278177458, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.1011506300419569, "learning_rate": 1.9494113211733018e-05, "loss": 0.004, "num_tokens": 8250602.0, "reward": 1.6423611640930176, "reward_std": 0.4954460859298706, "rewards/fixed_code_pass_all_test_reward/mean": 0.6423611044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.4954460859298706, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 278.875, "completions/mean_terminated_length": 278.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.19203099059214168, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.12770461943000555, "learning_rate": 1.9492088752409782e-05, "loss": 0.0051, "num_tokens": 8260905.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 168.625, "completions/mean_terminated_length": 168.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.19221545840250875, "frac_reward_zero_std": 1.0, "grad_norm": 0.216796875, "kl": 0.08289631828665733, "learning_rate": 1.94900603559731e-05, "loss": 0.0033, "num_tokens": 8265086.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.19239992621287585, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.1274629170075059, "learning_rate": 1.9488028023264306e-05, "loss": 0.0051, "num_tokens": 8272597.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 249.75, "completions/mean_terminated_length": 249.75, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.19258439402324296, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.1908887350000441, "learning_rate": 1.9485991755126374e-05, "loss": 0.0076, "num_tokens": 8281115.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 245.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.19276886183361003, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.08177869580686092, "learning_rate": 1.94839515524039e-05, "loss": 0.0033, "num_tokens": 8287598.0, "reward": 0.8804347515106201, "reward_std": 0.3560745418071747, "rewards/fixed_code_pass_all_test_reward/mean": 0.005434782709926367, "rewards/fixed_code_pass_all_test_reward/std": 0.015371887944638729, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 178.375, "completions/mean_terminated_length": 178.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.19295332964397713, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.14065332151949406, "learning_rate": 1.948190741594312e-05, "loss": 0.0056, "num_tokens": 8293033.0, "reward": 1.3301887512207031, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.33018869161605835, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 490.625, "completions/mean_terminated_length": 490.625, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.19313779745434423, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.059795851819217205, "learning_rate": 1.9479859346591893e-05, "loss": 0.0024, "num_tokens": 8303294.0, "reward": 1.2142856121063232, "reward_std": 0.1322600245475769, "rewards/fixed_code_pass_all_test_reward/mean": 0.2142857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.1322600245475769, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1933222652647113, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.17626169323921204, "learning_rate": 1.9477807345199717e-05, "loss": 0.0071, "num_tokens": 8312870.0, "reward": 1.5416667461395264, "reward_std": 0.5588394999504089, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2136233150959015, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 297.875, "completions/mean_terminated_length": 297.875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.1935067330750784, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.06003303988836706, "learning_rate": 1.9475751412617714e-05, "loss": 0.0024, "num_tokens": 8319405.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 468.375, "completions/mean_terminated_length": 468.375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.1936912008854455, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.12252700608223677, "learning_rate": 1.9473691549698645e-05, "loss": 0.0049, "num_tokens": 8328896.0, "reward": 0.13362069427967072, "reward_std": 0.37793639302253723, "rewards/fixed_code_pass_all_test_reward/mean": 0.008620689623057842, "rewards/fixed_code_pass_all_test_reward/std": 0.024382993578910828, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.19387566869581258, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.17399546224623919, "learning_rate": 1.947162775729689e-05, "loss": 0.007, "num_tokens": 8337666.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 134.25, "completions/mean_terminated_length": 134.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.19406013650617968, "frac_reward_zero_std": 1.0, "grad_norm": 0.1220703125, "kl": 0.0994268599897623, "learning_rate": 1.9469560036268475e-05, "loss": 0.004, "num_tokens": 8341508.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 200.75, "completions/mean_terminated_length": 200.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.19424460431654678, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.06831984873861074, "learning_rate": 1.946748838747104e-05, "loss": 0.0027, "num_tokens": 8350602.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 247.75, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.19442907212691385, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.11373131908476353, "learning_rate": 1.9465412811763862e-05, "loss": 0.0045, "num_tokens": 8355616.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.19461353993728095, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.06901945034042001, "learning_rate": 1.9463333310007842e-05, "loss": 0.0028, "num_tokens": 8359911.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1625.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 534.875, "completions/mean_terminated_length": 534.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.19479800774764802, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.13696023635566235, "learning_rate": 1.9461249883065522e-05, "loss": 0.0055, "num_tokens": 8366846.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.19498247555801512, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.05588448257185519, "learning_rate": 1.9459162531801048e-05, "loss": 0.0022, "num_tokens": 8371176.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 245.625, "completions/mean_terminated_length": 245.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.19516694336838222, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.10000312700867653, "learning_rate": 1.9457071257080216e-05, "loss": 0.004, "num_tokens": 8377701.0, "reward": 1.5726743936538696, "reward_std": 0.17266559600830078, "rewards/fixed_code_pass_all_test_reward/mean": 0.5726743936538696, "rewards/fixed_code_pass_all_test_reward/std": 0.17266561090946198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 771.5, "completions/mean_terminated_length": 589.1428833007812, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.1953514111787493, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.07598674204200506, "learning_rate": 1.9454976059770447e-05, "loss": 0.003, "num_tokens": 8387689.0, "reward": 1.5357142686843872, "reward_std": 0.7354021668434143, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.4040610194206238, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 113.375, "completions/mean_terminated_length": 113.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.1955358789891164, "frac_reward_zero_std": 0.0, "grad_norm": 3.390625, "kl": 0.13652819115668535, "learning_rate": 1.945287694074077e-05, "loss": 0.0055, "num_tokens": 8391276.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 184.125, "completions/mean_terminated_length": 184.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1957203467994835, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.10060554929077625, "learning_rate": 1.9450773900861867e-05, "loss": 0.004, "num_tokens": 8396965.0, "reward": 1.3430850505828857, "reward_std": 0.18042594194412231, "rewards/fixed_code_pass_all_test_reward/mean": 0.3430851101875305, "rewards/fixed_code_pass_all_test_reward/std": 0.18042594194412231, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.19590481460985057, "frac_reward_zero_std": 1.0, "grad_norm": 0.059814453125, "kl": 0.03365208255127072, "learning_rate": 1.9448666941006028e-05, "loss": 0.0013, "num_tokens": 8404161.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 255.75, "completions/mean_terminated_length": 255.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.19608928242021767, "frac_reward_zero_std": 1.0, "grad_norm": 0.458984375, "kl": 0.10099860839545727, "learning_rate": 1.9446556062047173e-05, "loss": 0.004, "num_tokens": 8414383.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 212.125, "completions/mean_terminated_length": 212.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.19627375023058477, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.06283058132976294, "learning_rate": 1.9444441264860855e-05, "loss": 0.0025, "num_tokens": 8419272.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.19645821804095184, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.08383928518742323, "learning_rate": 1.9442322550324237e-05, "loss": 0.0034, "num_tokens": 8431521.0, "reward": 0.9073275327682495, "reward_std": 0.3666653335094452, "rewards/fixed_code_pass_all_test_reward/mean": 0.03232758492231369, "rewards/fixed_code_pass_all_test_reward/std": 0.014388327486813068, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.19664268585131894, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.1684959721751511, "learning_rate": 1.9440199919316125e-05, "loss": 0.0067, "num_tokens": 8435612.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 249.75, "completions/mean_terminated_length": 249.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.19682715366168604, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.08252969477325678, "learning_rate": 1.9438073372716935e-05, "loss": 0.0033, "num_tokens": 8445914.0, "reward": 1.4031955003738403, "reward_std": 0.6248806118965149, "rewards/fixed_code_pass_all_test_reward/mean": 0.5281955003738403, "rewards/fixed_code_pass_all_test_reward/std": 0.3968970775604248, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.19701162147205312, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.10234119929373264, "learning_rate": 1.9435942911408715e-05, "loss": 0.0041, "num_tokens": 8456183.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 163.75, "completions/mean_terminated_length": 163.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.19719608928242022, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.09044493455439806, "learning_rate": 1.9433808536275135e-05, "loss": 0.0036, "num_tokens": 8463517.0, "reward": 1.1875, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.19738055709278732, "frac_reward_zero_std": 0.0, "grad_norm": 3.609375, "kl": 0.0669622584246099, "learning_rate": 1.9431670248201487e-05, "loss": 0.0027, "num_tokens": 8469230.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.1975650249031544, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.11207050457596779, "learning_rate": 1.9429528048074684e-05, "loss": 0.0045, "num_tokens": 8474641.0, "reward": 1.828125, "reward_std": 0.11451567709445953, "rewards/fixed_code_pass_all_test_reward/mean": 0.828125, "rewards/fixed_code_pass_all_test_reward/std": 0.11451567709445953, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 323.125, "completions/mean_terminated_length": 323.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.1977494927135215, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.044903200352564454, "learning_rate": 1.942738193678327e-05, "loss": 0.0018, "num_tokens": 8480938.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 202.0, "completions/mean_terminated_length": 202.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.1979339605238886, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.11501698708161712, "learning_rate": 1.9425231915217395e-05, "loss": 0.0046, "num_tokens": 8490306.0, "reward": 0.9342105388641357, "reward_std": 0.5986067652702332, "rewards/fixed_code_pass_all_test_reward/mean": 0.18421052396297455, "rewards/fixed_code_pass_all_test_reward/std": 0.19692933559417725, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 306.75, "completions/mean_terminated_length": 306.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.19811842833425566, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.07145734410732985, "learning_rate": 1.9423077984268854e-05, "loss": 0.0029, "num_tokens": 8500344.0, "reward": 1.719827651977539, "reward_std": 0.36664360761642456, "rewards/fixed_code_pass_all_test_reward/mean": 0.7198275923728943, "rewards/fixed_code_pass_all_test_reward/std": 0.36664363741874695, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 310.875, "completions/mean_terminated_length": 310.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.19830289614462276, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.08191005839034915, "learning_rate": 1.9420920144831044e-05, "loss": 0.0033, "num_tokens": 8511799.0, "reward": 1.1158536672592163, "reward_std": 0.5267954468727112, "rewards/fixed_code_pass_all_test_reward/mean": 0.2408536672592163, "rewards/fixed_code_pass_all_test_reward/std": 0.2893061637878418, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 280.375, "completions/mean_terminated_length": 280.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.19848736395498986, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.058836750918999314, "learning_rate": 1.941875839779899e-05, "loss": 0.0024, "num_tokens": 8522306.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.19867183176535694, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.06875989306718111, "learning_rate": 1.941659274406934e-05, "loss": 0.0028, "num_tokens": 8526991.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.19885629957572404, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.1099200276657939, "learning_rate": 1.9414423184540366e-05, "loss": 0.0044, "num_tokens": 8532759.0, "reward": 1.1607143878936768, "reward_std": 0.22180680930614471, "rewards/fixed_code_pass_all_test_reward/mean": 0.1607142835855484, "rewards/fixed_code_pass_all_test_reward/std": 0.22180679440498352, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 119.625, "completions/mean_terminated_length": 119.625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.19904076738609114, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.10573464818298817, "learning_rate": 1.9412249720111946e-05, "loss": 0.0042, "num_tokens": 8536492.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.1992252351964582, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.0850978852249682, "learning_rate": 1.941007235168559e-05, "loss": 0.0034, "num_tokens": 8545567.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.1994097030068253, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.09937876462936401, "learning_rate": 1.9407891080164418e-05, "loss": 0.004, "num_tokens": 8554374.0, "reward": 1.4004237651824951, "reward_std": 0.49827852845191956, "rewards/fixed_code_pass_all_test_reward/mean": 0.40042373538017273, "rewards/fixed_code_pass_all_test_reward/std": 0.49827852845191956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.1995941708171924, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.051675887778401375, "learning_rate": 1.940570590645318e-05, "loss": 0.0021, "num_tokens": 8560439.0, "reward": 1.5135869979858398, "reward_std": 0.3528246283531189, "rewards/fixed_code_pass_all_test_reward/mean": 0.6385869979858398, "rewards/fixed_code_pass_all_test_reward/std": 0.05802760645747185, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 222.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.19977863862755948, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.07573381648398936, "learning_rate": 1.940351683145824e-05, "loss": 0.003, "num_tokens": 8568732.0, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 165.375, "completions/mean_terminated_length": 165.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.19996310643792659, "frac_reward_zero_std": 1.0, "grad_norm": 0.4140625, "kl": 0.07259971601888537, "learning_rate": 1.9401323856087573e-05, "loss": 0.0029, "num_tokens": 8576831.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.20014757424829369, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.1291002780199051, "learning_rate": 1.9399126981250786e-05, "loss": 0.0052, "num_tokens": 8583925.0, "reward": 1.4744898080825806, "reward_std": 0.321711927652359, "rewards/fixed_code_pass_all_test_reward/mean": 0.47448980808258057, "rewards/fixed_code_pass_all_test_reward/std": 0.3217118978500366, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.20033204205866076, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.10140589997172356, "learning_rate": 1.9396926207859085e-05, "loss": 0.0041, "num_tokens": 8592033.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 424.625, "completions/mean_terminated_length": 424.625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.20051650986902786, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.052184017607942224, "learning_rate": 1.939472153682531e-05, "loss": 0.0021, "num_tokens": 8605878.0, "reward": 1.5625, "reward_std": 0.47715675830841064, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.47715675830841064, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 206.5, "completions/mean_terminated_length": 206.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.20070097767939493, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.05246146768331528, "learning_rate": 1.9392512969063914e-05, "loss": 0.0021, "num_tokens": 8614090.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.20088544548976203, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.04076973721385002, "learning_rate": 1.9390300505490957e-05, "loss": 0.0016, "num_tokens": 8622659.0, "reward": 1.412500023841858, "reward_std": 0.35632047057151794, "rewards/fixed_code_pass_all_test_reward/mean": 0.5375000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.14078859984874725, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 423.375, "completions/mean_terminated_length": 423.375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.20106991330012913, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.10173243377357721, "learning_rate": 1.938808414702412e-05, "loss": 0.0041, "num_tokens": 8636990.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.2012543811104962, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.041787998052313924, "learning_rate": 1.9385863894582706e-05, "loss": 0.0017, "num_tokens": 8643851.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.2014388489208633, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.11359761422500014, "learning_rate": 1.9383639749087626e-05, "loss": 0.0045, "num_tokens": 8647727.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 198.25, "completions/mean_terminated_length": 198.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2016233167312304, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.08622305933386087, "learning_rate": 1.9381411711461408e-05, "loss": 0.0034, "num_tokens": 8652241.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.20180778454159748, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.09322405327111483, "learning_rate": 1.937917978262819e-05, "loss": 0.0037, "num_tokens": 8660386.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 205.0, "completions/mean_terminated_length": 205.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.20199225235196458, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.08059562649577856, "learning_rate": 1.937694396351373e-05, "loss": 0.0032, "num_tokens": 8664970.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 250.5, "completions/mean_terminated_length": 250.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.20217672016233168, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.11936967354267836, "learning_rate": 1.9374704255045405e-05, "loss": 0.0048, "num_tokens": 8673598.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 134.75, "completions/mean_terminated_length": 134.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.20236118797269875, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.11459976434707642, "learning_rate": 1.9372460658152185e-05, "loss": 0.0046, "num_tokens": 8677644.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 384.375, "completions/mean_terminated_length": 384.375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.20254565578306585, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.04522628663107753, "learning_rate": 1.9370213173764674e-05, "loss": 0.0018, "num_tokens": 8688487.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.20273012359343295, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.06186259537935257, "learning_rate": 1.9367961802815078e-05, "loss": 0.0025, "num_tokens": 8696221.0, "reward": 1.9605263471603394, "reward_std": 0.11164842545986176, "rewards/fixed_code_pass_all_test_reward/mean": 0.9605263471603394, "rewards/fixed_code_pass_all_test_reward/std": 0.11164844036102295, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.20291459140380003, "frac_reward_zero_std": 1.0, "grad_norm": 0.56640625, "kl": 0.155505551956594, "learning_rate": 1.9365706546237212e-05, "loss": 0.0062, "num_tokens": 8700125.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.20309905921416713, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.07354765757918358, "learning_rate": 1.9363447404966517e-05, "loss": 0.0029, "num_tokens": 8709554.0, "reward": 1.5332279205322266, "reward_std": 0.3476118743419647, "rewards/fixed_code_pass_all_test_reward/mean": 0.658227801322937, "rewards/fixed_code_pass_all_test_reward/std": 0.05538296699523926, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.20328352702453423, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125, "kl": 0.07172942161560059, "learning_rate": 1.936118437994003e-05, "loss": 0.0029, "num_tokens": 8714200.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.2034679948349013, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.04819859517738223, "learning_rate": 1.9358917472096407e-05, "loss": 0.0019, "num_tokens": 8719457.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.2036524626452684, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.0883020511828363, "learning_rate": 1.935664668237591e-05, "loss": 0.0035, "num_tokens": 8725502.0, "reward": 1.2554347515106201, "reward_std": 0.3371317982673645, "rewards/fixed_code_pass_all_test_reward/mean": 0.2554347813129425, "rewards/fixed_code_pass_all_test_reward/std": 0.3371317982673645, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 128.625, "completions/mean_terminated_length": 128.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.2038369304556355, "frac_reward_zero_std": 1.0, "grad_norm": 0.24609375, "kl": 0.09286965057253838, "learning_rate": 1.9354372011720418e-05, "loss": 0.0037, "num_tokens": 8730251.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 192.25, "completions/mean_terminated_length": 192.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.20402139826600257, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.09996505407616496, "learning_rate": 1.9352093461073408e-05, "loss": 0.004, "num_tokens": 8739597.0, "reward": 1.7604167461395264, "reward_std": 0.376485139131546, "rewards/fixed_code_pass_all_test_reward/mean": 0.7604166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3764851689338684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.20420586607636967, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.09993414347991347, "learning_rate": 1.934981103137998e-05, "loss": 0.004, "num_tokens": 8747451.0, "reward": 1.78166663646698, "reward_std": 0.4050436317920685, "rewards/fixed_code_pass_all_test_reward/mean": 0.78166663646698, "rewards/fixed_code_pass_all_test_reward/std": 0.4050436019897461, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.20439033388673677, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.10278441710397601, "learning_rate": 1.9347524723586836e-05, "loss": 0.0041, "num_tokens": 8756041.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 314.375, "completions/mean_terminated_length": 314.375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.20457480169710385, "frac_reward_zero_std": 1.0, "grad_norm": 0.25, "kl": 0.07223545643500984, "learning_rate": 1.9345234538642287e-05, "loss": 0.0029, "num_tokens": 8763380.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 279.625, "completions/mean_terminated_length": 279.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.20475926950747095, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.04004163108766079, "learning_rate": 1.9342940477496247e-05, "loss": 0.0016, "num_tokens": 8770297.0, "reward": 1.2352941036224365, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.23529411852359772, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.20494373731783805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.053103114012628794, "learning_rate": 1.934064254110025e-05, "loss": 0.0021, "num_tokens": 8778843.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 437.375, "completions/mean_terminated_length": 437.375, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.20512820512820512, "frac_reward_zero_std": 1.0, "grad_norm": 0.052001953125, "kl": 0.03392018657177687, "learning_rate": 1.9338340730407427e-05, "loss": 0.0014, "num_tokens": 8793446.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.20531267293857222, "frac_reward_zero_std": 1.0, "grad_norm": 0.7265625, "kl": 0.10047763912007213, "learning_rate": 1.9336035046372518e-05, "loss": 0.004, "num_tokens": 8798339.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 251.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.20549714074893932, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.07786719407886267, "learning_rate": 1.9333725489951876e-05, "loss": 0.0031, "num_tokens": 8810172.0, "reward": 1.0750000476837158, "reward_std": 0.4652188718318939, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.18516403436660767, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2056816085593064, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.08910711389034986, "learning_rate": 1.9331412062103448e-05, "loss": 0.0036, "num_tokens": 8814297.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 299.875, "completions/mean_terminated_length": 299.875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.2058660763696735, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.052134092431515455, "learning_rate": 1.93290947637868e-05, "loss": 0.0021, "num_tokens": 8821624.0, "reward": 0.9921875, "reward_std": 0.40975481271743774, "rewards/fixed_code_pass_all_test_reward/mean": 0.1171875, "rewards/fixed_code_pass_all_test_reward/std": 0.09704047441482544, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 312.75, "completions/mean_terminated_length": 312.75, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.2060505441800406, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.04214198840782046, "learning_rate": 1.9326773595963097e-05, "loss": 0.0017, "num_tokens": 8829326.0, "reward": 1.6171875, "reward_std": 0.15467961132526398, "rewards/fixed_code_pass_all_test_reward/mean": 0.6171875, "rewards/fixed_code_pass_all_test_reward/std": 0.15467961132526398, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 208.375, "completions/mean_terminated_length": 208.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.20623501199040767, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.07817874662578106, "learning_rate": 1.9324448559595104e-05, "loss": 0.0031, "num_tokens": 8838409.0, "reward": 1.5125000476837158, "reward_std": 0.5221863389015198, "rewards/fixed_code_pass_all_test_reward/mean": 0.512499988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.5221863389015198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 352.875, "completions/mean_terminated_length": 352.875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.20641947980077477, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.09004027396440506, "learning_rate": 1.93221196556472e-05, "loss": 0.0036, "num_tokens": 8848560.0, "reward": 1.5120192766189575, "reward_std": 0.46198898553848267, "rewards/fixed_code_pass_all_test_reward/mean": 0.5120192766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.46198898553848267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 141.375, "completions/mean_terminated_length": 141.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.20660394761114187, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.056251227390021086, "learning_rate": 1.9319786885085366e-05, "loss": 0.0023, "num_tokens": 8852419.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 365.875, "completions/mean_terminated_length": 365.875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.20678841542150894, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.04185466980561614, "learning_rate": 1.9317450248877178e-05, "loss": 0.0017, "num_tokens": 8859762.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 397.5, "completions/mean_terminated_length": 397.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.20697288323187604, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.04131354880519211, "learning_rate": 1.9315109747991826e-05, "loss": 0.0017, "num_tokens": 8867934.0, "reward": 1.1411290168762207, "reward_std": 0.35585838556289673, "rewards/fixed_code_pass_all_test_reward/mean": 0.2661290168762207, "rewards/fixed_code_pass_all_test_reward/std": 0.16425840556621552, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 298.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.20715735104224312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.05889161676168442, "learning_rate": 1.9312765383400106e-05, "loss": 0.0024, "num_tokens": 8874832.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 316.5, "completions/mean_terminated_length": 316.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.20734181885261022, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.10096472688019276, "learning_rate": 1.9310417156074396e-05, "loss": 0.004, "num_tokens": 8884300.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.20752628666297732, "frac_reward_zero_std": 1.0, "grad_norm": 0.1865234375, "kl": 0.07569208275526762, "learning_rate": 1.9308065066988702e-05, "loss": 0.003, "num_tokens": 8889510.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 280.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.2077107544733444, "frac_reward_zero_std": 1.0, "grad_norm": 0.2041015625, "kl": 0.0877193845808506, "learning_rate": 1.9305709117118617e-05, "loss": 0.0035, "num_tokens": 8897999.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 259.5, "completions/mean_terminated_length": 259.5, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.2078952222837115, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.04750887886621058, "learning_rate": 1.930334930744133e-05, "loss": 0.0019, "num_tokens": 8903411.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.2080796900940786, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "kl": 0.10334835248067975, "learning_rate": 1.930098563893565e-05, "loss": 0.0041, "num_tokens": 8912256.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.20826415790444566, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.15986261470243335, "learning_rate": 1.929861811258197e-05, "loss": 0.0064, "num_tokens": 8916910.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.20844862571481276, "frac_reward_zero_std": 1.0, "grad_norm": 0.365234375, "kl": 0.17134613916277885, "learning_rate": 1.9296246729362293e-05, "loss": 0.0069, "num_tokens": 8920662.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.20863309352517986, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.05756544368341565, "learning_rate": 1.929387149026021e-05, "loss": 0.0023, "num_tokens": 8929856.0, "reward": 1.6931817531585693, "reward_std": 0.4234492778778076, "rewards/fixed_code_pass_all_test_reward/mean": 0.6931818723678589, "rewards/fixed_code_pass_all_test_reward/std": 0.4234493374824524, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 646.625, "completions/mean_terminated_length": 646.625, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.20881756133554694, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.07316334405913949, "learning_rate": 1.9291492396260923e-05, "loss": 0.0029, "num_tokens": 8946077.0, "reward": 0.7321428060531616, "reward_std": 0.6177051663398743, "rewards/fixed_code_pass_all_test_reward/mean": 0.1071428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.1478712111711502, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.20900202914591404, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.08935952326282859, "learning_rate": 1.928910944835123e-05, "loss": 0.0036, "num_tokens": 8950177.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 341.875, "completions/mean_terminated_length": 341.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.20918649695628114, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.09033359494060278, "learning_rate": 1.9286722647519527e-05, "loss": 0.0036, "num_tokens": 8959712.0, "reward": 1.8674242496490479, "reward_std": 0.10978318750858307, "rewards/fixed_code_pass_all_test_reward/mean": 0.8674242496490479, "rewards/fixed_code_pass_all_test_reward/std": 0.10978314280509949, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 290.625, "completions/mean_terminated_length": 290.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.2093709647666482, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.07477276725694537, "learning_rate": 1.9284331994755807e-05, "loss": 0.003, "num_tokens": 8969565.0, "reward": 1.75, "reward_std": 0.32732683420181274, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.13363061845302582, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.2095554325770153, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.06614039186388254, "learning_rate": 1.9281937491051658e-05, "loss": 0.0026, "num_tokens": 8977801.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 306.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.2097399003873824, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.09936216985806823, "learning_rate": 1.9279539137400268e-05, "loss": 0.004, "num_tokens": 8987316.0, "reward": 1.3819444179534912, "reward_std": 0.7401245832443237, "rewards/fixed_code_pass_all_test_reward/mean": 0.5069444179534912, "rewards/fixed_code_pass_all_test_reward/std": 0.5272031426429749, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 299.625, "completions/mean_terminated_length": 299.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.20992436819774948, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.12133851833641529, "learning_rate": 1.927713693479643e-05, "loss": 0.0049, "num_tokens": 8997785.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 255.5, "completions/mean_terminated_length": 255.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.21010883600811658, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.06688337726518512, "learning_rate": 1.927473088423652e-05, "loss": 0.0027, "num_tokens": 9002821.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 247.625, "completions/mean_terminated_length": 247.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.21029330381848368, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.0918566775508225, "learning_rate": 1.9272320986718512e-05, "loss": 0.0037, "num_tokens": 9009810.0, "reward": 1.34375, "reward_std": 0.6399986147880554, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 199.5, "completions/mean_terminated_length": 199.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.21047777162885076, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.09602054860442877, "learning_rate": 1.926990724324199e-05, "loss": 0.0038, "num_tokens": 9017110.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 211.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.21066223943921786, "frac_reward_zero_std": 1.0, "grad_norm": 0.30859375, "kl": 0.11834167502820492, "learning_rate": 1.926748965480811e-05, "loss": 0.0047, "num_tokens": 9024192.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 298.0, "completions/mean_terminated_length": 298.0, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.21084670724958496, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.04809158691205084, "learning_rate": 1.9265068222419646e-05, "loss": 0.0019, "num_tokens": 9033160.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 267.375, "completions/mean_terminated_length": 267.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.21103117505995203, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.043440277921035886, "learning_rate": 1.926264294708095e-05, "loss": 0.0017, "num_tokens": 9039627.0, "reward": 1.4437499046325684, "reward_std": 0.35047441720962524, "rewards/fixed_code_pass_all_test_reward/mean": 0.4437500238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.35047444701194763, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.21121564287031913, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.07906628958880901, "learning_rate": 1.926021382979798e-05, "loss": 0.0032, "num_tokens": 9046267.0, "reward": 1.5, "reward_std": 0.46861347556114197, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.44915637373924255, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 172.75, "completions/mean_terminated_length": 172.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.21140011068068623, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.18519817385822535, "learning_rate": 1.925778087157827e-05, "loss": 0.0074, "num_tokens": 9052297.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 356.25, "completions/mean_terminated_length": 356.25, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.2115845784910533, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.03509332315297797, "learning_rate": 1.925534407343097e-05, "loss": 0.0014, "num_tokens": 9059491.0, "reward": 1.53125, "reward_std": 0.38816189765930176, "rewards/fixed_code_pass_all_test_reward/mean": 0.53125, "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 378.625, "completions/mean_terminated_length": 378.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.2117690463014204, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.12579061742872, "learning_rate": 1.925290343636681e-05, "loss": 0.005, "num_tokens": 9070928.0, "reward": 0.875, "reward_std": 0.5739033818244934, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.208927720785141, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 345.25, "completions/mean_terminated_length": 345.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.2119535141117875, "frac_reward_zero_std": 1.0, "grad_norm": 0.19921875, "kl": 0.10768410749733448, "learning_rate": 1.9250458961398106e-05, "loss": 0.0043, "num_tokens": 9080970.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 315.875, "completions/mean_terminated_length": 315.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.21213798192215458, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.06505818944424391, "learning_rate": 1.924801064953878e-05, "loss": 0.0026, "num_tokens": 9090737.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 534.625, "completions/mean_terminated_length": 534.625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.21232244973252168, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.06824079726357013, "learning_rate": 1.9245558501804333e-05, "loss": 0.0027, "num_tokens": 9102094.0, "reward": 0.68359375, "reward_std": 0.5674847364425659, "rewards/fixed_code_pass_all_test_reward/mean": 0.05859375, "rewards/fixed_code_pass_all_test_reward/std": 0.06291713565587997, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 510.5, "completions/mean_terminated_length": 510.5, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.21250691754288878, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.04672255436889827, "learning_rate": 1.924310251921187e-05, "loss": 0.0019, "num_tokens": 9112546.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 363.25, "completions/mean_terminated_length": 363.25, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.21269138535325585, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.08460514293983579, "learning_rate": 1.9240642702780073e-05, "loss": 0.0034, "num_tokens": 9121220.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 389.375, "completions/mean_terminated_length": 389.375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.21287585316362295, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.08459047437645495, "learning_rate": 1.9238179053529227e-05, "loss": 0.0034, "num_tokens": 9128751.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.21306032097399003, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "kl": 0.10655240062624216, "learning_rate": 1.9235711572481196e-05, "loss": 0.0043, "num_tokens": 9133041.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.21324478878435713, "frac_reward_zero_std": 1.0, "grad_norm": 0.1689453125, "kl": 0.11353489104658365, "learning_rate": 1.923324026065944e-05, "loss": 0.0045, "num_tokens": 9140936.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 343.0, "completions/mean_terminated_length": 343.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.21342925659472423, "frac_reward_zero_std": 1.0, "grad_norm": 0.21484375, "kl": 0.05724132969044149, "learning_rate": 1.9230765119089007e-05, "loss": 0.0023, "num_tokens": 9147672.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2136137244050913, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.11053066654130816, "learning_rate": 1.9228286148796528e-05, "loss": 0.0044, "num_tokens": 9170653.0, "reward": 1.6927711963653564, "reward_std": 0.40130218863487244, "rewards/fixed_code_pass_all_test_reward/mean": 0.6927710771560669, "rewards/fixed_code_pass_all_test_reward/std": 0.4013022482395172, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 364.25, "completions/mean_terminated_length": 364.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.2137981922154584, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.09856203477829695, "learning_rate": 1.922580335081023e-05, "loss": 0.0039, "num_tokens": 9181383.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 211.375, "completions/mean_terminated_length": 211.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2139826600258255, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.06453670002520084, "learning_rate": 1.9223316726159923e-05, "loss": 0.0026, "num_tokens": 9185850.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.21416712783619257, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.07621222687885165, "learning_rate": 1.9220826275877003e-05, "loss": 0.003, "num_tokens": 9194201.0, "reward": 1.6927083730697632, "reward_std": 0.36929354071617126, "rewards/fixed_code_pass_all_test_reward/mean": 0.6927083730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.36929357051849365, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 699.625, "completions/mean_terminated_length": 699.625, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.21435159564655967, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.08256300818175077, "learning_rate": 1.921833200099446e-05, "loss": 0.0033, "num_tokens": 9210806.0, "reward": 0.9285714626312256, "reward_std": 0.4581621289253235, "rewards/fixed_code_pass_all_test_reward/mean": 0.1785714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.1266293227672577, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 243.125, "completions/mean_terminated_length": 243.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.21453606345692677, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.06607956485822797, "learning_rate": 1.921583390254686e-05, "loss": 0.0026, "num_tokens": 9215583.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 447.25, "completions/mean_terminated_length": 447.25, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.21472053126729385, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.08663049200549722, "learning_rate": 1.921333198157036e-05, "loss": 0.0035, "num_tokens": 9227209.0, "reward": 1.045454502105713, "reward_std": 0.0971858873963356, "rewards/fixed_code_pass_all_test_reward/mean": 0.04545454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.09718590974807739, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.21490499907766095, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.07852879632264376, "learning_rate": 1.921082623910271e-05, "loss": 0.0031, "num_tokens": 9232306.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.21508946688802805, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.1804099241271615, "learning_rate": 1.920831667618323e-05, "loss": 0.0072, "num_tokens": 9240645.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.21527393469839512, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.05292487354017794, "learning_rate": 1.920580329385284e-05, "loss": 0.0021, "num_tokens": 9246024.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 329.25, "completions/mean_terminated_length": 329.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.21545840250876222, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.11557532520964742, "learning_rate": 1.920328609315403e-05, "loss": 0.0046, "num_tokens": 9255634.0, "reward": 1.35869562625885, "reward_std": 0.21394792199134827, "rewards/fixed_code_pass_all_test_reward/mean": 0.3586956560611725, "rewards/fixed_code_pass_all_test_reward/std": 0.21394789218902588, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 250.0, "completions/mean_terminated_length": 250.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.21564287031912932, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.08000065898522735, "learning_rate": 1.920076507513088e-05, "loss": 0.0032, "num_tokens": 9264658.0, "reward": 1.450657844543457, "reward_std": 0.14167122542858124, "rewards/fixed_code_pass_all_test_reward/mean": 0.4506579041481018, "rewards/fixed_code_pass_all_test_reward/std": 0.14167122542858124, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2158273381294964, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.06496397475712001, "learning_rate": 1.9198240240829063e-05, "loss": 0.0026, "num_tokens": 9268375.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.2160118059398635, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.06698041781783104, "learning_rate": 1.9195711591295814e-05, "loss": 0.0027, "num_tokens": 9277143.0, "reward": 1.868990421295166, "reward_std": 0.06272414326667786, "rewards/fixed_code_pass_all_test_reward/mean": 0.8689903616905212, "rewards/fixed_code_pass_all_test_reward/std": 0.06272414326667786, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 118.125, "completions/mean_terminated_length": 118.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.2161962737502306, "frac_reward_zero_std": 1.0, "grad_norm": 0.25, "kl": 0.12592246942222118, "learning_rate": 1.919317912757997e-05, "loss": 0.005, "num_tokens": 9280744.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 199.375, "completions/mean_terminated_length": 199.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.21638074156059767, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.09479475673288107, "learning_rate": 1.919064285073194e-05, "loss": 0.0038, "num_tokens": 9290899.0, "reward": 1.2245370149612427, "reward_std": 0.257636159658432, "rewards/fixed_code_pass_all_test_reward/mean": 0.22453704476356506, "rewards/fixed_code_pass_all_test_reward/std": 0.2576361894607544, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 211.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.21656520937096477, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.12758992705494165, "learning_rate": 1.918810276180372e-05, "loss": 0.0051, "num_tokens": 9298716.0, "reward": 1.2346938848495483, "reward_std": 0.6889725923538208, "rewards/fixed_code_pass_all_test_reward/mean": 0.35969388484954834, "rewards/fixed_code_pass_all_test_reward/std": 0.4969039261341095, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.21674967718133187, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.056756129022687674, "learning_rate": 1.9185558861848875e-05, "loss": 0.0023, "num_tokens": 9303449.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 258.875, "completions/mean_terminated_length": 258.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.21693414499169894, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.06834268337115645, "learning_rate": 1.9183011151922568e-05, "loss": 0.0027, "num_tokens": 9309912.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 197.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.21711861280206604, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.06089191185310483, "learning_rate": 1.9180459633081534e-05, "loss": 0.0024, "num_tokens": 9314583.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 294.875, "completions/mean_terminated_length": 294.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.21730308061243314, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.08773162425495684, "learning_rate": 1.9177904306384085e-05, "loss": 0.0035, "num_tokens": 9321302.0, "reward": 1.0681817531585693, "reward_std": 0.19284728169441223, "rewards/fixed_code_pass_all_test_reward/mean": 0.06818182021379471, "rewards/fixed_code_pass_all_test_reward/std": 0.1928473263978958, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 282.125, "completions/mean_terminated_length": 282.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.21748754842280021, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.07955946447327733, "learning_rate": 1.9175345172890113e-05, "loss": 0.0032, "num_tokens": 9328063.0, "reward": 1.741279125213623, "reward_std": 0.05755512788891792, "rewards/fixed_code_pass_all_test_reward/mean": 0.7412790656089783, "rewards/fixed_code_pass_all_test_reward/std": 0.057555194944143295, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.21767201623316731, "frac_reward_zero_std": 1.0, "grad_norm": 0.2060546875, "kl": 0.11199323926120996, "learning_rate": 1.9172782233661097e-05, "loss": 0.0045, "num_tokens": 9336714.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 411.375, "completions/mean_terminated_length": 411.375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.21785648404353442, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.07628715946339071, "learning_rate": 1.9170215489760084e-05, "loss": 0.0031, "num_tokens": 9345509.0, "reward": 1.5, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 328.25, "completions/mean_terminated_length": 328.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.2180409518539015, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.04754471220076084, "learning_rate": 1.916764494225171e-05, "loss": 0.0019, "num_tokens": 9352119.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 99.0, "completions/mean_terminated_length": 99.0, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.2182254196642686, "frac_reward_zero_std": 1.0, "grad_norm": 0.439453125, "kl": 0.08974086306989193, "learning_rate": 1.9165070592202175e-05, "loss": 0.0036, "num_tokens": 9355727.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 407.875, "completions/mean_terminated_length": 407.875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.2184098874746357, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.06104309612419456, "learning_rate": 1.9162492440679272e-05, "loss": 0.0024, "num_tokens": 9364350.0, "reward": 1.442307710647583, "reward_std": 0.2738998532295227, "rewards/fixed_code_pass_all_test_reward/mean": 0.442307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.2738998532295227, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.21859435528500276, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.030580809339880943, "learning_rate": 1.9159910488752356e-05, "loss": 0.0012, "num_tokens": 9369763.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 314.0, "completions/mean_terminated_length": 314.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.21877882309536986, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.08837619470432401, "learning_rate": 1.9157324737492366e-05, "loss": 0.0035, "num_tokens": 9379011.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 227.375, "completions/mean_terminated_length": 227.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.21896329090573696, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.10173805756494403, "learning_rate": 1.9154735187971817e-05, "loss": 0.0041, "num_tokens": 9385110.0, "reward": 1.311170220375061, "reward_std": 0.2783292829990387, "rewards/fixed_code_pass_all_test_reward/mean": 0.31117022037506104, "rewards/fixed_code_pass_all_test_reward/std": 0.2783292531967163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.21914775871610404, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.03733280207961798, "learning_rate": 1.9152141841264797e-05, "loss": 0.0015, "num_tokens": 9391490.0, "reward": 1.7589285373687744, "reward_std": 0.1319151371717453, "rewards/fixed_code_pass_all_test_reward/mean": 0.7589285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.1319151371717453, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 138.25, "completions/mean_terminated_length": 138.25, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.21933222652647114, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.13287966046482325, "learning_rate": 1.9149544698446974e-05, "loss": 0.0053, "num_tokens": 9395436.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 157.25, "completions/mean_terminated_length": 157.25, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.2195166943368382, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.060172553174197674, "learning_rate": 1.914694376059558e-05, "loss": 0.0024, "num_tokens": 9399478.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 266.875, "completions/mean_terminated_length": 266.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.2197011621472053, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.057845264207571745, "learning_rate": 1.9144339028789436e-05, "loss": 0.0023, "num_tokens": 9404645.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 382.625, "completions/mean_terminated_length": 382.625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.2198856299575724, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.03631381154991686, "learning_rate": 1.9141730504108923e-05, "loss": 0.0015, "num_tokens": 9413338.0, "reward": 1.3000000715255737, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 452.25, "completions/mean_terminated_length": 452.25, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.22007009776793948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0576171875, "kl": 0.026743174763396382, "learning_rate": 1.9139118187636e-05, "loss": 0.0011, "num_tokens": 9425420.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 203.75, "completions/mean_terminated_length": 203.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.22025456557830658, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.08682212047278881, "learning_rate": 1.9136502080454202e-05, "loss": 0.0035, "num_tokens": 9431106.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 257.125, "completions/mean_terminated_length": 257.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.22043903338867368, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.09308147989213467, "learning_rate": 1.9133882183648636e-05, "loss": 0.0037, "num_tokens": 9441283.0, "reward": 1.0271739959716797, "reward_std": 0.02250213921070099, "rewards/fixed_code_pass_all_test_reward/mean": 0.02717391401529312, "rewards/fixed_code_pass_all_test_reward/std": 0.02250213921070099, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.22062350119904076, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.023478888208046556, "learning_rate": 1.9131258498305976e-05, "loss": 0.0009, "num_tokens": 9445939.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 118.625, "completions/mean_terminated_length": 118.625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.22080796900940786, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.06952975993044674, "learning_rate": 1.912863102551447e-05, "loss": 0.0028, "num_tokens": 9449632.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 418.375, "completions/mean_terminated_length": 418.375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.22099243681977496, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.05820470885373652, "learning_rate": 1.9125999766363935e-05, "loss": 0.0023, "num_tokens": 9458699.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.22117690463014203, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.0671815425157547, "learning_rate": 1.912336472194576e-05, "loss": 0.0027, "num_tokens": 9462684.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 269.125, "completions/mean_terminated_length": 269.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.22136137244050913, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.07939829817041755, "learning_rate": 1.9120725893352913e-05, "loss": 0.0032, "num_tokens": 9472373.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.22154584025087623, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.11392215825617313, "learning_rate": 1.9118083281679915e-05, "loss": 0.0046, "num_tokens": 9480407.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 277.75, "completions/mean_terminated_length": 277.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.2217303080612433, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.09361373260617256, "learning_rate": 1.9115436888022862e-05, "loss": 0.0037, "num_tokens": 9486821.0, "reward": 1.5357142686843872, "reward_std": 0.501859188079834, "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.501859188079834, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 194.875, "completions/mean_terminated_length": 194.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.2219147758716104, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.06003462173976004, "learning_rate": 1.911278671347943e-05, "loss": 0.0024, "num_tokens": 9492612.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.2220992436819775, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.023432772373780608, "learning_rate": 1.9110132759148847e-05, "loss": 0.0009, "num_tokens": 9498037.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 441.75, "completions/mean_terminated_length": 441.75, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.22228371149234458, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.02403651108033955, "learning_rate": 1.910747502613192e-05, "loss": 0.001, "num_tokens": 9507803.0, "reward": 1.75, "reward_std": 0.34503278136253357, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 192.5, "completions/mean_terminated_length": 192.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.22246817930271168, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.053485218435525894, "learning_rate": 1.9104813515531017e-05, "loss": 0.0021, "num_tokens": 9512319.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 265.875, "completions/mean_terminated_length": 265.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.22265264711307878, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.06199831352569163, "learning_rate": 1.9102148228450074e-05, "loss": 0.0025, "num_tokens": 9521646.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 277.125, "completions/mean_terminated_length": 277.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.22283711492344585, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.08981215243693441, "learning_rate": 1.90994791659946e-05, "loss": 0.0036, "num_tokens": 9527623.0, "reward": 1.0357142686843872, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 267.125, "completions/mean_terminated_length": 267.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.22302158273381295, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.07535909581929445, "learning_rate": 1.9096806329271657e-05, "loss": 0.003, "num_tokens": 9538072.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.22320605054418005, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.05056330165825784, "learning_rate": 1.9094129719389886e-05, "loss": 0.002, "num_tokens": 9542805.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 245.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.22339051835454712, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.07968293549492955, "learning_rate": 1.9091449337459484e-05, "loss": 0.0032, "num_tokens": 9549192.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.22357498616491422, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.07456178264692426, "learning_rate": 1.9088765184592217e-05, "loss": 0.003, "num_tokens": 9554408.0, "reward": 1.235795497894287, "reward_std": 0.35720863938331604, "rewards/fixed_code_pass_all_test_reward/mean": 0.23579546809196472, "rewards/fixed_code_pass_all_test_reward/std": 0.3572086691856384, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.22375945397528132, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.14707824494689703, "learning_rate": 1.9086077261901412e-05, "loss": 0.0059, "num_tokens": 9564526.0, "reward": 1.290816307067871, "reward_std": 0.3673064708709717, "rewards/fixed_code_pass_all_test_reward/mean": 0.2908163368701935, "rewards/fixed_code_pass_all_test_reward/std": 0.3673064708709717, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2239439217856484, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.08852860471233726, "learning_rate": 1.9083385570501965e-05, "loss": 0.0035, "num_tokens": 9573348.0, "reward": 1.6796875, "reward_std": 0.4471980035305023, "rewards/fixed_code_pass_all_test_reward/mean": 0.6796875, "rewards/fixed_code_pass_all_test_reward/std": 0.4471980035305023, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 105.25, "completions/mean_terminated_length": 105.25, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.2241283895960155, "frac_reward_zero_std": 0.0, "grad_norm": 3.4375, "kl": 0.07919905195012689, "learning_rate": 1.9080690111510323e-05, "loss": 0.0032, "num_tokens": 9576894.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.2243128574063826, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.09398824395611882, "learning_rate": 1.9077990886044512e-05, "loss": 0.0038, "num_tokens": 9580852.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 244.375, "completions/mean_terminated_length": 244.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.22449732521674967, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.09226812003180385, "learning_rate": 1.9075287895224112e-05, "loss": 0.0037, "num_tokens": 9586983.0, "reward": 1.0357142686843872, "reward_std": 0.06613001227378845, "rewards/fixed_code_pass_all_test_reward/mean": 0.0357142873108387, "rewards/fixed_code_pass_all_test_reward/std": 0.06613001227378845, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 370.625, "completions/mean_terminated_length": 370.625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.22468179302711677, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.03863282594829798, "learning_rate": 1.9072581140170264e-05, "loss": 0.0015, "num_tokens": 9594156.0, "reward": 1.837499976158142, "reward_std": 0.25599944591522217, "rewards/fixed_code_pass_all_test_reward/mean": 0.8374999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.25599944591522217, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 154.125, "completions/mean_terminated_length": 154.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.22486626083748387, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.04521885816939175, "learning_rate": 1.906987062200567e-05, "loss": 0.0018, "num_tokens": 9598261.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 229.25, "completions/mean_terminated_length": 229.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.22505072864785094, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.12131957057863474, "learning_rate": 1.9067156341854595e-05, "loss": 0.0049, "num_tokens": 9607759.0, "reward": 1.7833333015441895, "reward_std": 0.04714040830731392, "rewards/fixed_code_pass_all_test_reward/mean": 0.7833333015441895, "rewards/fixed_code_pass_all_test_reward/std": 0.0471404492855072, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 350.25, "completions/mean_terminated_length": 350.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.22523519645821805, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.05988248996436596, "learning_rate": 1.9064438300842868e-05, "loss": 0.0024, "num_tokens": 9618377.0, "reward": 1.6780303716659546, "reward_std": 0.26265057921409607, "rewards/fixed_code_pass_all_test_reward/mean": 0.6780303120613098, "rewards/fixed_code_pass_all_test_reward/std": 0.2626505494117737, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.22541966426858512, "frac_reward_zero_std": 1.0, "grad_norm": 0.1513671875, "kl": 0.07479103235527873, "learning_rate": 1.9061716500097865e-05, "loss": 0.003, "num_tokens": 9623498.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 211.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.22560413207895222, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962890625, "kl": 0.09084597462788224, "learning_rate": 1.9058990940748535e-05, "loss": 0.0036, "num_tokens": 9631572.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 457.5, "completions/mean_terminated_length": 457.5, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.22578859988931932, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.055321923922747374, "learning_rate": 1.9056261623925386e-05, "loss": 0.0022, "num_tokens": 9646344.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 302.25, "completions/mean_terminated_length": 302.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2259730676996864, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.039653665386140347, "learning_rate": 1.905352855076047e-05, "loss": 0.0016, "num_tokens": 9651938.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.2261575355100535, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.09576788148842752, "learning_rate": 1.9050791722387413e-05, "loss": 0.0038, "num_tokens": 9658244.0, "reward": 1.9305555820465088, "reward_std": 0.05750548839569092, "rewards/fixed_code_pass_all_test_reward/mean": 0.930555522441864, "rewards/fixed_code_pass_all_test_reward/std": 0.05750546231865883, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 220.125, "completions/mean_terminated_length": 220.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.2263420033204206, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.07580615067854524, "learning_rate": 1.9048051139941394e-05, "loss": 0.003, "num_tokens": 9668309.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 352.625, "completions/mean_terminated_length": 352.625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.22652647113078767, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.07414681650698185, "learning_rate": 1.904530680455914e-05, "loss": 0.003, "num_tokens": 9676362.0, "reward": 1.4821429252624512, "reward_std": 0.27465859055519104, "rewards/fixed_code_pass_all_test_reward/mean": 0.4821428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.27465856075286865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 152.125, "completions/mean_terminated_length": 152.125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.22671093894115477, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.15836967714130878, "learning_rate": 1.904255871737895e-05, "loss": 0.0063, "num_tokens": 9683443.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 162.5, "completions/mean_terminated_length": 162.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.22689540675152187, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.10350457485765219, "learning_rate": 1.9039806879540666e-05, "loss": 0.0041, "num_tokens": 9693031.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.22707987456188894, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.08929629065096378, "learning_rate": 1.9037051292185693e-05, "loss": 0.0036, "num_tokens": 9698128.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 155.375, "completions/mean_terminated_length": 155.375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.22726434237225604, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.08719458244740963, "learning_rate": 1.903429195645699e-05, "loss": 0.0035, "num_tokens": 9706707.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.22744881018262314, "frac_reward_zero_std": 1.0, "grad_norm": 0.3203125, "kl": 0.09546113573014736, "learning_rate": 1.9031528873499067e-05, "loss": 0.0038, "num_tokens": 9712267.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 253.25, "completions/mean_terminated_length": 253.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.2276332779929902, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.06285127345472574, "learning_rate": 1.9028762044457994e-05, "loss": 0.0025, "num_tokens": 9718837.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.2278177458033573, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.09231393272057176, "learning_rate": 1.902599147048139e-05, "loss": 0.0037, "num_tokens": 9726702.0, "reward": 1.73046875, "reward_std": 0.37451139092445374, "rewards/fixed_code_pass_all_test_reward/mean": 0.73046875, "rewards/fixed_code_pass_all_test_reward/std": 0.37451139092445374, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 408.75, "completions/mean_terminated_length": 408.75, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.2280022136137244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.04771248484030366, "learning_rate": 1.902321715271843e-05, "loss": 0.0019, "num_tokens": 9734284.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 231.5, "completions/mean_terminated_length": 231.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2281866814240915, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.06599913537502289, "learning_rate": 1.902043909231984e-05, "loss": 0.0026, "num_tokens": 9743104.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2283711492344586, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.0844385395757854, "learning_rate": 1.9017657290437902e-05, "loss": 0.0034, "num_tokens": 9748554.0, "reward": 1.0860215425491333, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.08602150529623032, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.2285556170448257, "frac_reward_zero_std": 1.0, "grad_norm": 0.81640625, "kl": 0.14957738481462002, "learning_rate": 1.9014871748226446e-05, "loss": 0.006, "num_tokens": 9756840.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 277.625, "completions/mean_terminated_length": 277.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.22874008485519276, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.06142194289714098, "learning_rate": 1.9012082466840855e-05, "loss": 0.0025, "num_tokens": 9766477.0, "reward": 1.9261363744735718, "reward_std": 0.18749041855335236, "rewards/fixed_code_pass_all_test_reward/mean": 0.9261363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.18749044835567474, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 508.375, "completions/mean_terminated_length": 508.375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.22892455266555986, "frac_reward_zero_std": 1.0, "grad_norm": 0.05029296875, "kl": 0.028937112307175994, "learning_rate": 1.900928944743806e-05, "loss": 0.0012, "num_tokens": 9775576.0, "reward": 1.923076868057251, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9230769276618958, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.22910902047592696, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.08646658388897777, "learning_rate": 1.900649269117655e-05, "loss": 0.0035, "num_tokens": 9783673.0, "reward": 1.625, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 131.625, "completions/mean_terminated_length": 131.625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.22929348828629403, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.0718552921898663, "learning_rate": 1.9003692199216356e-05, "loss": 0.0029, "num_tokens": 9787390.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.22947795609666113, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.10696170618757606, "learning_rate": 1.900088797271906e-05, "loss": 0.0043, "num_tokens": 9793141.0, "reward": 1.6666667461395264, "reward_std": 0.20573778450489044, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 471.5, "completions/mean_terminated_length": 471.5, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.22966242390702823, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.05108597734943032, "learning_rate": 1.8998080012847798e-05, "loss": 0.002, "num_tokens": 9803561.0, "reward": 0.7583333253860474, "reward_std": 0.46827951073646545, "rewards/fixed_code_pass_all_test_reward/mean": 0.008333333767950535, "rewards/fixed_code_pass_all_test_reward/std": 0.015430336818099022, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.2298468917173953, "frac_reward_zero_std": 0.0, "grad_norm": 3.453125, "kl": 0.15273124165832996, "learning_rate": 1.8995268320767254e-05, "loss": 0.0061, "num_tokens": 9810548.0, "reward": 1.5333333015441895, "reward_std": 0.1885617971420288, "rewards/fixed_code_pass_all_test_reward/mean": 0.5333333611488342, "rewards/fixed_code_pass_all_test_reward/std": 0.18856181204319, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.2300313595277624, "frac_reward_zero_std": 1.0, "grad_norm": 0.5390625, "kl": 0.12298192782327533, "learning_rate": 1.8992452897643647e-05, "loss": 0.0049, "num_tokens": 9821840.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.2302158273381295, "frac_reward_zero_std": 1.0, "grad_norm": 0.150390625, "kl": 0.06851087743416429, "learning_rate": 1.8989633744644762e-05, "loss": 0.0027, "num_tokens": 9826357.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.23040029514849658, "frac_reward_zero_std": 1.0, "grad_norm": 0.294921875, "kl": 0.08486226899549365, "learning_rate": 1.898681086293992e-05, "loss": 0.0034, "num_tokens": 9830551.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 252.0, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.23058476295886368, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.027815771580208093, "learning_rate": 1.8983984253699994e-05, "loss": 0.0011, "num_tokens": 9836111.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 250.5, "completions/mean_terminated_length": 250.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.23076923076923078, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.09559668693691492, "learning_rate": 1.8981153918097397e-05, "loss": 0.0038, "num_tokens": 9842363.0, "reward": 1.0489130020141602, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.17391304671764374, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 164.5, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.23095369857959785, "frac_reward_zero_std": 0.0, "grad_norm": 3.390625, "kl": 0.12738389568403363, "learning_rate": 1.8978319857306094e-05, "loss": 0.0051, "num_tokens": 9850743.0, "reward": 1.8571429252624512, "reward_std": 0.2645200192928314, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.2645200192928314, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 418.75, "completions/mean_terminated_length": 418.75, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.23113816638996496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.02734701451845467, "learning_rate": 1.897548207250159e-05, "loss": 0.0011, "num_tokens": 9858405.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 327.0, "completions/mean_terminated_length": 327.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.23132263420033206, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.0765072163194418, "learning_rate": 1.8972640564860937e-05, "loss": 0.0031, "num_tokens": 9868333.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 412.125, "completions/mean_terminated_length": 412.125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.23150710201069913, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.049343378050252795, "learning_rate": 1.896979533556273e-05, "loss": 0.002, "num_tokens": 9876198.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.23169156982106623, "frac_reward_zero_std": 1.0, "grad_norm": 0.162109375, "kl": 0.10531375952996314, "learning_rate": 1.8966946385787114e-05, "loss": 0.0042, "num_tokens": 9884302.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.2318760376314333, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.07673428952693939, "learning_rate": 1.8964093716715767e-05, "loss": 0.0031, "num_tokens": 9891946.0, "reward": 1.7166666984558105, "reward_std": 0.2884000539779663, "rewards/fixed_code_pass_all_test_reward/mean": 0.7166666984558105, "rewards/fixed_code_pass_all_test_reward/std": 0.2884000837802887, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 237.375, "completions/mean_terminated_length": 237.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.2320605054418004, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.069320117123425, "learning_rate": 1.8961237329531912e-05, "loss": 0.0028, "num_tokens": 9900189.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 210.75, "completions/mean_terminated_length": 210.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.2322449732521675, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.09632832813076675, "learning_rate": 1.8958377225420324e-05, "loss": 0.0039, "num_tokens": 9907363.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.23242944106253458, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.09265866130590439, "learning_rate": 1.8955513405567307e-05, "loss": 0.0037, "num_tokens": 9914830.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 190.5, "completions/mean_terminated_length": 190.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.23261390887290168, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.11927621625363827, "learning_rate": 1.8952645871160713e-05, "loss": 0.0048, "num_tokens": 9920346.0, "reward": 1.703125, "reward_std": 0.41153839230537415, "rewards/fixed_code_pass_all_test_reward/mean": 0.703125, "rewards/fixed_code_pass_all_test_reward/std": 0.41153839230537415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 210.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.23279837668326878, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.11679213307797909, "learning_rate": 1.8949774623389935e-05, "loss": 0.0047, "num_tokens": 9929242.0, "reward": 1.0347222089767456, "reward_std": 0.30994322896003723, "rewards/fixed_code_pass_all_test_reward/mean": 0.1597222238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.1437636762857437, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.23298284449363585, "frac_reward_zero_std": 0.0, "grad_norm": 4.25, "kl": 0.12221890594810247, "learning_rate": 1.8946899663445902e-05, "loss": 0.0049, "num_tokens": 9933178.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 533.25, "completions/mean_terminated_length": 533.25, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.23316731230400295, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.03914674534462392, "learning_rate": 1.894402099252109e-05, "loss": 0.0016, "num_tokens": 9948132.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 217.625, "completions/mean_terminated_length": 217.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.23335178011437005, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.11165118124336004, "learning_rate": 1.8941138611809504e-05, "loss": 0.0045, "num_tokens": 9954097.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 204.625, "completions/mean_terminated_length": 204.625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.23353624792473712, "frac_reward_zero_std": 1.0, "grad_norm": 0.212890625, "kl": 0.12788341101258993, "learning_rate": 1.8938252522506693e-05, "loss": 0.0051, "num_tokens": 9960950.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 309.625, "completions/mean_terminated_length": 309.625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.23372071573510422, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.0950158997438848, "learning_rate": 1.8935362725809754e-05, "loss": 0.0038, "num_tokens": 9971067.0, "reward": 1.5568182468414307, "reward_std": 0.3346834182739258, "rewards/fixed_code_pass_all_test_reward/mean": 0.5568181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.3346834182739258, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.23390518354547132, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.11858467012643814, "learning_rate": 1.8932469222917297e-05, "loss": 0.0047, "num_tokens": 9977563.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2340896513558384, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.08217783598229289, "learning_rate": 1.8929572015029498e-05, "loss": 0.0033, "num_tokens": 9988101.0, "reward": 1.6397058963775635, "reward_std": 0.2799088954925537, "rewards/fixed_code_pass_all_test_reward/mean": 0.6397058963775635, "rewards/fixed_code_pass_all_test_reward/std": 0.2799088954925537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 133.875, "completions/mean_terminated_length": 133.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.2342741191662055, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.0881442055106163, "learning_rate": 1.892667110334805e-05, "loss": 0.0035, "num_tokens": 9995132.0, "reward": 1.9642857313156128, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.2344585869765726, "frac_reward_zero_std": 1.0, "grad_norm": 0.212890625, "kl": 0.1048635602928698, "learning_rate": 1.892376648907619e-05, "loss": 0.0042, "num_tokens": 10005079.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 208.375, "completions/mean_terminated_length": 208.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.23464305478693967, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.0899050454609096, "learning_rate": 1.892085817341869e-05, "loss": 0.0036, "num_tokens": 10010698.0, "reward": 1.486111044883728, "reward_std": 0.29057806730270386, "rewards/fixed_code_pass_all_test_reward/mean": 0.4861111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.29057809710502625, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 267.375, "completions/mean_terminated_length": 267.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.23482752259730677, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.058697377098724246, "learning_rate": 1.891794615758185e-05, "loss": 0.0023, "num_tokens": 10017509.0, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.23501199040767387, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.10295295249670744, "learning_rate": 1.891503044277352e-05, "loss": 0.0041, "num_tokens": 10024877.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.23519645821804094, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "kl": 0.15055046556517482, "learning_rate": 1.8912111030203072e-05, "loss": 0.006, "num_tokens": 10031425.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.23538092602840804, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.07557458383962512, "learning_rate": 1.8909187921081418e-05, "loss": 0.003, "num_tokens": 10037960.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 320.375, "completions/mean_terminated_length": 320.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.23556539383877514, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.08873918117024004, "learning_rate": 1.8906261116620993e-05, "loss": 0.0035, "num_tokens": 10049267.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 285.25, "completions/mean_terminated_length": 285.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.23574986164914222, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.09273132123053074, "learning_rate": 1.8903330618035782e-05, "loss": 0.0037, "num_tokens": 10057693.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.23593432945950932, "frac_reward_zero_std": 0.0, "grad_norm": 3.53125, "kl": 0.09048820519819856, "learning_rate": 1.8900396426541284e-05, "loss": 0.0036, "num_tokens": 10066069.0, "reward": 1.2331080436706543, "reward_std": 0.09466349333524704, "rewards/fixed_code_pass_all_test_reward/mean": 0.23310810327529907, "rewards/fixed_code_pass_all_test_reward/std": 0.09466351568698883, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 216.875, "completions/mean_terminated_length": 216.875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.23611879726987642, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.07982678851112723, "learning_rate": 1.8897458543354544e-05, "loss": 0.0032, "num_tokens": 10074244.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 295.25, "completions/mean_terminated_length": 295.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.2363032650802435, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.15892321011051536, "learning_rate": 1.8894516969694133e-05, "loss": 0.0064, "num_tokens": 10082822.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 250.125, "completions/mean_terminated_length": 250.125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.2364877328906106, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.04709341248963028, "learning_rate": 1.889157170678015e-05, "loss": 0.0019, "num_tokens": 10087847.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.2366722007009777, "frac_reward_zero_std": 0.0, "grad_norm": 5.59375, "kl": 0.1755965193733573, "learning_rate": 1.8888622755834222e-05, "loss": 0.007, "num_tokens": 10094246.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.23685666851134476, "frac_reward_zero_std": 1.0, "grad_norm": 0.2119140625, "kl": 0.100152138620615, "learning_rate": 1.888567011807952e-05, "loss": 0.004, "num_tokens": 10101643.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 357.375, "completions/mean_terminated_length": 115.85714721679688, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.23704113632171186, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.12150335591286421, "learning_rate": 1.888271379474072e-05, "loss": 0.0049, "num_tokens": 10107230.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.23722560413207897, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.07435227558016777, "learning_rate": 1.8879753787044058e-05, "loss": 0.003, "num_tokens": 10112977.0, "reward": 1.9702380895614624, "reward_std": 0.08417939394712448, "rewards/fixed_code_pass_all_test_reward/mean": 0.9702380895614624, "rewards/fixed_code_pass_all_test_reward/std": 0.08417937159538269, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 268.625, "completions/mean_terminated_length": 268.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.23741007194244604, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.06946290051564574, "learning_rate": 1.887679009621727e-05, "loss": 0.0028, "num_tokens": 10122422.0, "reward": 1.25, "reward_std": 0.29546844959259033, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.29546844959259033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.23759453975281314, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.17228818405419588, "learning_rate": 1.8873822723489633e-05, "loss": 0.0069, "num_tokens": 10130693.0, "reward": 1.0174999237060547, "reward_std": 0.019820604473352432, "rewards/fixed_code_pass_all_test_reward/mean": 0.017500000074505806, "rewards/fixed_code_pass_all_test_reward/std": 0.019820624962449074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.2377790075631802, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.05748336995020509, "learning_rate": 1.8870851670091954e-05, "loss": 0.0023, "num_tokens": 10138220.0, "reward": 1.308333396911621, "reward_std": 0.07071070373058319, "rewards/fixed_code_pass_all_test_reward/mean": 0.3083333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 174.5, "completions/mean_terminated_length": 174.5, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2379634753735473, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.15306519623845816, "learning_rate": 1.8867876937256556e-05, "loss": 0.0061, "num_tokens": 10146760.0, "reward": 1.3125, "reward_std": 0.1292274445295334, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.12922745943069458, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 160.375, "completions/mean_terminated_length": 160.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.2381479431839144, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.10634991666302085, "learning_rate": 1.8864898526217295e-05, "loss": 0.0043, "num_tokens": 10157211.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.23833241099428149, "frac_reward_zero_std": 0.0, "grad_norm": 3.421875, "kl": 0.13709397241473198, "learning_rate": 1.8861916438209555e-05, "loss": 0.0055, "num_tokens": 10165170.0, "reward": 1.1666667461395264, "reward_std": 0.12848319113254547, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.12848322093486786, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 309.625, "completions/mean_terminated_length": 309.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.23851687880464859, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.12793859746307135, "learning_rate": 1.885893067447024e-05, "loss": 0.0051, "num_tokens": 10175495.0, "reward": 1.3055555820465088, "reward_std": 0.4417078495025635, "rewards/fixed_code_pass_all_test_reward/mean": 0.3055555522441864, "rewards/fixed_code_pass_all_test_reward/std": 0.4417078495025635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.23870134661501569, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125, "kl": 0.16298149386420846, "learning_rate": 1.8855941236237776e-05, "loss": 0.0065, "num_tokens": 10180502.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.23888581442538276, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.04078978672623634, "learning_rate": 1.8852948124752125e-05, "loss": 0.0016, "num_tokens": 10187619.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 223.25, "completions/mean_terminated_length": 223.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.23907028223574986, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.09844178054481745, "learning_rate": 1.8849951341254758e-05, "loss": 0.0039, "num_tokens": 10198349.0, "reward": 1.975000023841858, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 109.0, "completions/mean_terminated_length": 109.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.23925475004611696, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.09320723544806242, "learning_rate": 1.884695088698868e-05, "loss": 0.0037, "num_tokens": 10206317.0, "reward": 1.0364582538604736, "reward_std": 0.014731382951140404, "rewards/fixed_code_pass_all_test_reward/mean": 0.0364583358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.01473139226436615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 86.25, "completions/mean_terminated_length": 86.25, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.23943921785648403, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.06544944550842047, "learning_rate": 1.8843946763198414e-05, "loss": 0.0026, "num_tokens": 10209895.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 459.375, "completions/mean_terminated_length": 459.375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.23962368566685113, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.03229556797305122, "learning_rate": 1.8840938971130005e-05, "loss": 0.0013, "num_tokens": 10219050.0, "reward": 1.9500000476837158, "reward_std": 0.1414213627576828, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 347.625, "completions/mean_terminated_length": 347.625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.23980815347721823, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.03613125788979232, "learning_rate": 1.8837927512031022e-05, "loss": 0.0014, "num_tokens": 10227455.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 162.875, "completions/mean_terminated_length": 162.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2399926212875853, "frac_reward_zero_std": 1.0, "grad_norm": 0.1689453125, "kl": 0.09378097532317042, "learning_rate": 1.883491238715055e-05, "loss": 0.0038, "num_tokens": 10232718.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.2401770890979524, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.08148466423153877, "learning_rate": 1.8831893597739204e-05, "loss": 0.0033, "num_tokens": 10238011.0, "reward": 1.795454502105713, "reward_std": 0.2822995185852051, "rewards/fixed_code_pass_all_test_reward/mean": 0.7954545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.28229954838752747, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 405.625, "completions/mean_terminated_length": 405.625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.2403615569083195, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.07149765361100435, "learning_rate": 1.8828871145049103e-05, "loss": 0.0029, "num_tokens": 10250824.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.24054602471868658, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.15722572058439255, "learning_rate": 1.8825845030333905e-05, "loss": 0.0063, "num_tokens": 10260940.0, "reward": 1.6527777910232544, "reward_std": 0.4840944707393646, "rewards/fixed_code_pass_all_test_reward/mean": 0.6527777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.4840944707393646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.24073049252905368, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.07912445813417435, "learning_rate": 1.882281525484877e-05, "loss": 0.0032, "num_tokens": 10271796.0, "reward": 1.8794642686843872, "reward_std": 0.19965523481369019, "rewards/fixed_code_pass_all_test_reward/mean": 0.8794642686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.19965526461601257, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 190.25, "completions/mean_terminated_length": 190.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.24091496033942078, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.10239686304703355, "learning_rate": 1.8819781819850383e-05, "loss": 0.0041, "num_tokens": 10281334.0, "reward": 1.4836957454681396, "reward_std": 0.1977970451116562, "rewards/fixed_code_pass_all_test_reward/mean": 0.4836956262588501, "rewards/fixed_code_pass_all_test_reward/std": 0.19779707491397858, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.24109942814978785, "frac_reward_zero_std": 0.0, "grad_norm": 4.75, "kl": 0.16629721247591078, "learning_rate": 1.8816744726596957e-05, "loss": 0.0067, "num_tokens": 10288161.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 229.0, "completions/mean_terminated_length": 229.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.24128389596015495, "frac_reward_zero_std": 1.0, "grad_norm": 0.1669921875, "kl": 0.0852769766934216, "learning_rate": 1.88137039763482e-05, "loss": 0.0034, "num_tokens": 10298137.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 160.375, "completions/mean_terminated_length": 160.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.24146836377052205, "frac_reward_zero_std": 1.0, "grad_norm": 0.1826171875, "kl": 0.12459582835435867, "learning_rate": 1.881065957036536e-05, "loss": 0.005, "num_tokens": 10306644.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 219.375, "completions/mean_terminated_length": 219.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.24165283158088913, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.07154052355326712, "learning_rate": 1.880761150991118e-05, "loss": 0.0029, "num_tokens": 10312823.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 164.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.24183729939125623, "frac_reward_zero_std": 1.0, "grad_norm": 0.2138671875, "kl": 0.0701401368714869, "learning_rate": 1.880455979624994e-05, "loss": 0.0028, "num_tokens": 10317139.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 118.5, "completions/mean_terminated_length": 118.5, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.24202176720162333, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.08863711589947343, "learning_rate": 1.8801504430647424e-05, "loss": 0.0035, "num_tokens": 10322783.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2422062350119904, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.0938778929412365, "learning_rate": 1.8798445414370926e-05, "loss": 0.0038, "num_tokens": 10330434.0, "reward": 1.8828125, "reward_std": 0.1446593850851059, "rewards/fixed_code_pass_all_test_reward/mean": 0.8828125, "rewards/fixed_code_pass_all_test_reward/std": 0.1446593850851059, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.2423907028223575, "frac_reward_zero_std": 1.0, "grad_norm": 0.2890625, "kl": 0.04951098491437733, "learning_rate": 1.879538274868926e-05, "loss": 0.002, "num_tokens": 10336012.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.2425751706327246, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "kl": 0.1090414603240788, "learning_rate": 1.879231643487276e-05, "loss": 0.0044, "num_tokens": 10340158.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 155.625, "completions/mean_terminated_length": 155.625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.24275963844309167, "frac_reward_zero_std": 1.0, "grad_norm": 0.32421875, "kl": 0.09308837680146098, "learning_rate": 1.8789246474193258e-05, "loss": 0.0037, "num_tokens": 10344323.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 133.0, "completions/mean_terminated_length": 133.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.24294410625345877, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.0670363565441221, "learning_rate": 1.8786172867924114e-05, "loss": 0.0027, "num_tokens": 10348363.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.24312857406382588, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.0961556644178927, "learning_rate": 1.8783095617340193e-05, "loss": 0.0038, "num_tokens": 10358774.0, "reward": 1.932692289352417, "reward_std": 0.1903749406337738, "rewards/fixed_code_pass_all_test_reward/mean": 0.932692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.19037489593029022, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 155.25, "completions/mean_terminated_length": 155.25, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.24331304187419295, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "kl": 0.1197963273152709, "learning_rate": 1.878001472371787e-05, "loss": 0.0048, "num_tokens": 10367088.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.24349750968456005, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.06442373129539192, "learning_rate": 1.8776930188335037e-05, "loss": 0.0026, "num_tokens": 10375431.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 91.875, "completions/mean_terminated_length": 91.875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.24368197749492715, "frac_reward_zero_std": 1.0, "grad_norm": 0.1611328125, "kl": 0.09991543320938945, "learning_rate": 1.8773842012471088e-05, "loss": 0.004, "num_tokens": 10378878.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 285.25, "completions/mean_terminated_length": 285.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.24386644530529422, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.07509567216038704, "learning_rate": 1.877075019740693e-05, "loss": 0.003, "num_tokens": 10384168.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.24405091311566132, "frac_reward_zero_std": 0.0, "grad_norm": 3.296875, "kl": 0.0795449239667505, "learning_rate": 1.8767654744424986e-05, "loss": 0.0032, "num_tokens": 10388280.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 151.5, "completions/mean_terminated_length": 151.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.2442353809260284, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.08028299640864134, "learning_rate": 1.876455565480918e-05, "loss": 0.0032, "num_tokens": 10392452.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 265.875, "completions/mean_terminated_length": 265.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.2444198487363955, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.056012172950431705, "learning_rate": 1.8761452929844955e-05, "loss": 0.0022, "num_tokens": 10398107.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 251.875, "completions/mean_terminated_length": 251.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.2446043165467626, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.05728443246334791, "learning_rate": 1.8758346570819242e-05, "loss": 0.0023, "num_tokens": 10407002.0, "reward": 1.7471264600753784, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7471264600753784, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 433.25, "completions/mean_terminated_length": 433.25, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.24478878435712967, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.053921584505587816, "learning_rate": 1.8755236579020503e-05, "loss": 0.0022, "num_tokens": 10418932.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 361.125, "completions/mean_terminated_length": 361.125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.24497325216749677, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.044229753548279405, "learning_rate": 1.8752122955738686e-05, "loss": 0.0018, "num_tokens": 10425845.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.24515771997786387, "frac_reward_zero_std": 1.0, "grad_norm": 1.3515625, "kl": 0.1659008590504527, "learning_rate": 1.874900570226526e-05, "loss": 0.0066, "num_tokens": 10430208.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 347.375, "completions/mean_terminated_length": 347.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.24534218778823094, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.08580513019114733, "learning_rate": 1.8745884819893194e-05, "loss": 0.0034, "num_tokens": 10437659.0, "reward": 1.7211538553237915, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.8461538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.24552665559859804, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.08520369743928313, "learning_rate": 1.8742760309916962e-05, "loss": 0.0034, "num_tokens": 10445990.0, "reward": 1.7788461446762085, "reward_std": 0.40949738025665283, "rewards/fixed_code_pass_all_test_reward/mean": 0.7788461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.40949735045433044, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 281.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.24571112340896514, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.07034035306423903, "learning_rate": 1.8739632173632545e-05, "loss": 0.0028, "num_tokens": 10457441.0, "reward": 1.797619104385376, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7976190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.24589559121933222, "frac_reward_zero_std": 1.0, "grad_norm": 0.2216796875, "kl": 0.14210033090785146, "learning_rate": 1.8736500412337424e-05, "loss": 0.0057, "num_tokens": 10464180.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.24608005902969932, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.06701172143220901, "learning_rate": 1.8733365027330585e-05, "loss": 0.0027, "num_tokens": 10472972.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 368.5, "completions/mean_terminated_length": 368.5, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.24626452684006642, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.07278338447213173, "learning_rate": 1.8730226019912518e-05, "loss": 0.0029, "num_tokens": 10481256.0, "reward": 1.4208333492279053, "reward_std": 0.12076142430305481, "rewards/fixed_code_pass_all_test_reward/mean": 0.4208333492279053, "rewards/fixed_code_pass_all_test_reward/std": 0.1207614615559578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.2464489946504335, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.1849921392276883, "learning_rate": 1.872708339138522e-05, "loss": 0.0074, "num_tokens": 10485464.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 312.625, "completions/mean_terminated_length": 312.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.2466334624608006, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.04131636652164161, "learning_rate": 1.8723937143052186e-05, "loss": 0.0017, "num_tokens": 10492101.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.2468179302711677, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.0629950372967869, "learning_rate": 1.8720787276218403e-05, "loss": 0.0025, "num_tokens": 10496341.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 234.375, "completions/mean_terminated_length": 234.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.24700239808153476, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.05945300543680787, "learning_rate": 1.871763379219037e-05, "loss": 0.0024, "num_tokens": 10501040.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 300.375, "completions/mean_terminated_length": 300.375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.24718686589190186, "frac_reward_zero_std": 1.0, "grad_norm": 0.158203125, "kl": 0.0679636166896671, "learning_rate": 1.8714476692276097e-05, "loss": 0.0027, "num_tokens": 10511835.0, "reward": 1.08695650100708, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.08695652335882187, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.24737133370226896, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.08398942044004798, "learning_rate": 1.8711315977785065e-05, "loss": 0.0034, "num_tokens": 10516231.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 205.125, "completions/mean_terminated_length": 205.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.24755580151263604, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.07121687708422542, "learning_rate": 1.870815165002828e-05, "loss": 0.0028, "num_tokens": 10520648.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 440.25, "completions/mean_terminated_length": 440.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.24774026932300314, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.05901872110553086, "learning_rate": 1.870498371031823e-05, "loss": 0.0024, "num_tokens": 10529242.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 181.0, "completions/mean_terminated_length": 181.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.24792473713337024, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.07960994192399085, "learning_rate": 1.870181215996891e-05, "loss": 0.0032, "num_tokens": 10537642.0, "reward": 1.60869562625885, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6086956262588501, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.2481092049437373, "frac_reward_zero_std": 1.0, "grad_norm": 0.26171875, "kl": 0.07687115529552102, "learning_rate": 1.8698637000295816e-05, "loss": 0.0031, "num_tokens": 10543418.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.2482936727541044, "frac_reward_zero_std": 1.0, "grad_norm": 0.361328125, "kl": 0.11647214740514755, "learning_rate": 1.8695458232615928e-05, "loss": 0.0047, "num_tokens": 10552507.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.2484781405644715, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.041788886999711394, "learning_rate": 1.8692275858247737e-05, "loss": 0.0017, "num_tokens": 10559877.0, "reward": 1.6889533996582031, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.8139534592628479, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.24866260837483858, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.08857871126383543, "learning_rate": 1.8689089878511216e-05, "loss": 0.0035, "num_tokens": 10568986.0, "reward": 1.9208860397338867, "reward_std": 0.22376799583435059, "rewards/fixed_code_pass_all_test_reward/mean": 0.9208860397338867, "rewards/fixed_code_pass_all_test_reward/std": 0.2237679809331894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 177.375, "completions/mean_terminated_length": 177.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.24884707618520568, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.09477173257619143, "learning_rate": 1.8685900294727845e-05, "loss": 0.0038, "num_tokens": 10575957.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 135.875, "completions/mean_terminated_length": 135.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.24903154399557278, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.07237973273731768, "learning_rate": 1.8682707108220594e-05, "loss": 0.0029, "num_tokens": 10584068.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 105.375, "completions/mean_terminated_length": 105.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.24921601180593986, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.08639946719631553, "learning_rate": 1.867951032031393e-05, "loss": 0.0035, "num_tokens": 10587695.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 251.625, "completions/mean_terminated_length": 251.625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.24940047961630696, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.0656609337311238, "learning_rate": 1.8676309932333807e-05, "loss": 0.0026, "num_tokens": 10595684.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 553.25, "completions/mean_terminated_length": 553.25, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.24958494742667406, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.02917829854413867, "learning_rate": 1.867310594560768e-05, "loss": 0.0012, "num_tokens": 10612638.0, "reward": 1.952380895614624, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9523809552192688, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 158.75, "completions/mean_terminated_length": 158.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.24976941523704113, "frac_reward_zero_std": 1.0, "grad_norm": 0.228515625, "kl": 0.10601191595196724, "learning_rate": 1.866989836146449e-05, "loss": 0.0042, "num_tokens": 10620508.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 254.0, "completions/mean_terminated_length": 254.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.24995388304740823, "frac_reward_zero_std": 1.0, "grad_norm": 0.37109375, "kl": 0.08250629529356956, "learning_rate": 1.866668718123468e-05, "loss": 0.0033, "num_tokens": 10631492.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 243.25, "completions/mean_terminated_length": 243.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.2501383508577753, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.039928304962813854, "learning_rate": 1.8663472406250173e-05, "loss": 0.0016, "num_tokens": 10640254.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 326.375, "completions/mean_terminated_length": 326.375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.25032281866814243, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.04046353348530829, "learning_rate": 1.866025403784439e-05, "loss": 0.0016, "num_tokens": 10652041.0, "reward": 1.9722222089767456, "reward_std": 0.07856743782758713, "rewards/fixed_code_pass_all_test_reward/mean": 0.9722222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.07856741547584534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 264.125, "completions/mean_terminated_length": 264.125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.2505072864785095, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0604249001480639, "learning_rate": 1.8657032077352237e-05, "loss": 0.0024, "num_tokens": 10658770.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.2506917542888766, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.04562810156494379, "learning_rate": 1.865380652611012e-05, "loss": 0.0018, "num_tokens": 10663865.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 200.75, "completions/mean_terminated_length": 200.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.2508762220992437, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.07896366342902184, "learning_rate": 1.8650577385455927e-05, "loss": 0.0032, "num_tokens": 10672263.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.2510606899096108, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.09797381469979882, "learning_rate": 1.864734465672903e-05, "loss": 0.0039, "num_tokens": 10676854.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 369.25, "completions/mean_terminated_length": 369.25, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.25124515771997785, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.033370476332493126, "learning_rate": 1.8644108341270298e-05, "loss": 0.0013, "num_tokens": 10685240.0, "reward": 1.5, "reward_std": 0.4140393137931824, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.41403937339782715, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 298.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.251429625530345, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0870793960057199, "learning_rate": 1.864086844042209e-05, "loss": 0.0035, "num_tokens": 10694062.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.25161409334071205, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.0737880696542561, "learning_rate": 1.863762495552824e-05, "loss": 0.003, "num_tokens": 10702677.0, "reward": 0.4763513207435608, "reward_std": 0.6574273109436035, "rewards/fixed_code_pass_all_test_reward/mean": 0.10135135054588318, "rewards/fixed_code_pass_all_test_reward/std": 0.13987815380096436, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 217.625, "completions/mean_terminated_length": 217.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.2517985611510791, "frac_reward_zero_std": 0.0, "grad_norm": 3.53125, "kl": 0.11603337852284312, "learning_rate": 1.8634377887934077e-05, "loss": 0.0046, "num_tokens": 10707962.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 206.125, "completions/mean_terminated_length": 206.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.25198302896144625, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.050316498847678304, "learning_rate": 1.8631127238986418e-05, "loss": 0.002, "num_tokens": 10716291.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 140.875, "completions/mean_terminated_length": 140.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.2521674967718133, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.1012763031758368, "learning_rate": 1.862787301003356e-05, "loss": 0.0041, "num_tokens": 10720146.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 325.5, "completions/mean_terminated_length": 325.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.2523519645821804, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.046854516956955194, "learning_rate": 1.8624615202425284e-05, "loss": 0.0019, "num_tokens": 10730102.0, "reward": 1.1085526943206787, "reward_std": 0.0723257064819336, "rewards/fixed_code_pass_all_test_reward/mean": 0.10855262726545334, "rewards/fixed_code_pass_all_test_reward/std": 0.0723256915807724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.2525364323925475, "frac_reward_zero_std": 1.0, "grad_norm": 0.484375, "kl": 0.08764395327307284, "learning_rate": 1.8621353817512862e-05, "loss": 0.0035, "num_tokens": 10734232.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 254.625, "completions/mean_terminated_length": 254.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2527209002029146, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.07904953695833683, "learning_rate": 1.8618088856649046e-05, "loss": 0.0032, "num_tokens": 10740325.0, "reward": 1.7037036418914795, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7037037014961243, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.2529053680132817, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.11030492396093905, "learning_rate": 1.8614820321188068e-05, "loss": 0.0044, "num_tokens": 10748928.0, "reward": 1.133474588394165, "reward_std": 0.026316048577427864, "rewards/fixed_code_pass_all_test_reward/mean": 0.13347457349300385, "rewards/fixed_code_pass_all_test_reward/std": 0.02631605975329876, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.25308983582364875, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.07812581374309957, "learning_rate": 1.861154821248565e-05, "loss": 0.0031, "num_tokens": 10759062.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 361.0, "completions/mean_terminated_length": 361.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.2532743036340159, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.03817011398496106, "learning_rate": 1.8608272531898984e-05, "loss": 0.0015, "num_tokens": 10765686.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.25345877144438295, "frac_reward_zero_std": 1.0, "grad_norm": 0.30078125, "kl": 0.1152905598282814, "learning_rate": 1.8604993280786766e-05, "loss": 0.0046, "num_tokens": 10769999.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 324.875, "completions/mean_terminated_length": 324.875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.25364323925475, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.06957451580092311, "learning_rate": 1.8601710460509146e-05, "loss": 0.0028, "num_tokens": 10775862.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 259.5, "completions/mean_terminated_length": 259.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.25382770706511715, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.08190432051196694, "learning_rate": 1.859842407242777e-05, "loss": 0.0033, "num_tokens": 10783714.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 288.0, "completions/mean_terminated_length": 288.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.2540121748754842, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.10825658868998289, "learning_rate": 1.8595134117905764e-05, "loss": 0.0043, "num_tokens": 10788786.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 535.875, "completions/mean_terminated_length": 535.875, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.2541966426858513, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.03133757412433624, "learning_rate": 1.8591840598307726e-05, "loss": 0.0013, "num_tokens": 10799713.0, "reward": 1.9506173133850098, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9506173133850098, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.2543811104962184, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.11443915963172913, "learning_rate": 1.8588543514999743e-05, "loss": 0.0046, "num_tokens": 10804618.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 391.5, "completions/mean_terminated_length": 391.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2545655783065855, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.04949003050569445, "learning_rate": 1.8585242869349366e-05, "loss": 0.002, "num_tokens": 10812598.0, "reward": 1.9241070747375488, "reward_std": 0.21465744078159332, "rewards/fixed_code_pass_all_test_reward/mean": 0.9241071343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.21465741097927094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 473.25, "completions/mean_terminated_length": 473.25, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.25475004611695257, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.05645648296922445, "learning_rate": 1.8581938662725635e-05, "loss": 0.0023, "num_tokens": 10825784.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.2549345139273197, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.057725150138139725, "learning_rate": 1.8578630896499062e-05, "loss": 0.0023, "num_tokens": 10830206.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.25511898173768677, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.15709770703688264, "learning_rate": 1.8575319572041644e-05, "loss": 0.0063, "num_tokens": 10835402.0, "reward": 1.2887930870056152, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.4137931168079376, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 342.125, "completions/mean_terminated_length": 342.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.25530344954805384, "frac_reward_zero_std": 1.0, "grad_norm": 0.232421875, "kl": 0.0730627840384841, "learning_rate": 1.8572004690726836e-05, "loss": 0.0029, "num_tokens": 10845827.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 341.125, "completions/mean_terminated_length": 341.125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.25548791735842097, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.07896123290993273, "learning_rate": 1.856868625392959e-05, "loss": 0.0032, "num_tokens": 10855476.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 159.75, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.25567238516878804, "frac_reward_zero_std": 0.0, "grad_norm": 6.0, "kl": 0.1497931368649006, "learning_rate": 1.8565364263026314e-05, "loss": 0.006, "num_tokens": 10864730.0, "reward": 0.7529761791229248, "reward_std": 0.8874039649963379, "rewards/fixed_code_pass_all_test_reward/mean": 0.2529761791229248, "rewards/fixed_code_pass_all_test_reward/std": 0.4611462950706482, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.2558568529791551, "frac_reward_zero_std": 1.0, "grad_norm": 0.197265625, "kl": 0.10588618973270059, "learning_rate": 1.8562038719394904e-05, "loss": 0.0042, "num_tokens": 10869705.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 347.375, "completions/mean_terminated_length": 347.375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.25604132078952224, "frac_reward_zero_std": 1.0, "grad_norm": 0.05712890625, "kl": 0.03923724475316703, "learning_rate": 1.855870962441472e-05, "loss": 0.0016, "num_tokens": 10876828.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 479.875, "completions/mean_terminated_length": 479.875, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.2562257885998893, "frac_reward_zero_std": 1.0, "grad_norm": 0.0546875, "kl": 0.029928817646577954, "learning_rate": 1.8555376979466596e-05, "loss": 0.0012, "num_tokens": 10886739.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 237.875, "completions/mean_terminated_length": 237.875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.2564102564102564, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.05060851527377963, "learning_rate": 1.8552040785932846e-05, "loss": 0.002, "num_tokens": 10893130.0, "reward": 1.7142857313156128, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7142857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 377.25, "completions/mean_terminated_length": 377.25, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.2565947242206235, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.09243745636194944, "learning_rate": 1.854870104519725e-05, "loss": 0.0037, "num_tokens": 10900524.0, "reward": 1.0535714626312256, "reward_std": 0.7361451387405396, "rewards/fixed_code_pass_all_test_reward/mean": 0.4285714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.2645200490951538, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 235.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.2567791920309906, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.0507191838696599, "learning_rate": 1.854535775864506e-05, "loss": 0.002, "num_tokens": 10905293.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 250.625, "completions/mean_terminated_length": 250.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.25696365984135766, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.07957502454519272, "learning_rate": 1.8542010927663e-05, "loss": 0.0032, "num_tokens": 10911434.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.2571481276517248, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.10212097247131169, "learning_rate": 1.853866055363926e-05, "loss": 0.0041, "num_tokens": 10918979.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.25733259546209186, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.07974706683307886, "learning_rate": 1.8535306637963506e-05, "loss": 0.0032, "num_tokens": 10927203.0, "reward": 1.4419643878936768, "reward_std": 0.27922311425209045, "rewards/fixed_code_pass_all_test_reward/mean": 0.4419642686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.27922314405441284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.25751706327245893, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.08198631042614579, "learning_rate": 1.8531949182026867e-05, "loss": 0.0033, "num_tokens": 10932531.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 181.875, "completions/mean_terminated_length": 181.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.25770153108282606, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.0979921636171639, "learning_rate": 1.8528588187221947e-05, "loss": 0.0039, "num_tokens": 10939986.0, "reward": 1.9598214626312256, "reward_std": 0.024798767641186714, "rewards/fixed_code_pass_all_test_reward/mean": 0.9598214626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.024798741564154625, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.25788599889319314, "frac_reward_zero_std": 1.0, "grad_norm": 0.27734375, "kl": 0.10256267711520195, "learning_rate": 1.8525223654942812e-05, "loss": 0.0041, "num_tokens": 10944852.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.2580704667035602, "frac_reward_zero_std": 1.0, "grad_norm": 0.5078125, "kl": 0.10553994006477296, "learning_rate": 1.8521855586584997e-05, "loss": 0.0042, "num_tokens": 10950757.0, "reward": 1.7777777910232544, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7777777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 423.5, "completions/mean_terminated_length": 191.42857360839844, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.25825493451392734, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.07546366704627872, "learning_rate": 1.8518483983545503e-05, "loss": 0.003, "num_tokens": 10957081.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 398.125, "completions/mean_terminated_length": 398.125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.2584394023242944, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.03860169253312051, "learning_rate": 1.85151088472228e-05, "loss": 0.0015, "num_tokens": 10964650.0, "reward": 1.3409090042114258, "reward_std": 0.06428244709968567, "rewards/fixed_code_pass_all_test_reward/mean": 0.34090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 197.625, "completions/mean_terminated_length": 197.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.2586238701346615, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.08227922953665257, "learning_rate": 1.851173017901682e-05, "loss": 0.0033, "num_tokens": 10974167.0, "reward": 1.94021737575531, "reward_std": 0.16909077763557434, "rewards/fixed_code_pass_all_test_reward/mean": 0.9402173757553101, "rewards/fixed_code_pass_all_test_reward/std": 0.16909076273441315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 218.375, "completions/mean_terminated_length": 218.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2588083379450286, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.11887378757819533, "learning_rate": 1.8508347980328966e-05, "loss": 0.0048, "num_tokens": 10982954.0, "reward": 0.9270833134651184, "reward_std": 0.38114094734191895, "rewards/fixed_code_pass_all_test_reward/mean": 0.0520833358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.07339929789304733, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 182.75, "completions/mean_terminated_length": 182.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.2589928057553957, "frac_reward_zero_std": 1.0, "grad_norm": 0.439453125, "kl": 0.10942733800038695, "learning_rate": 1.85049622525621e-05, "loss": 0.0044, "num_tokens": 10991376.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 381.125, "completions/mean_terminated_length": 381.125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.25917727356576276, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.06772355642169714, "learning_rate": 1.8501572997120544e-05, "loss": 0.0027, "num_tokens": 10999497.0, "reward": 1.634615421295166, "reward_std": 0.238868847489357, "rewards/fixed_code_pass_all_test_reward/mean": 0.634615421295166, "rewards/fixed_code_pass_all_test_reward/std": 0.238868847489357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.2593617413761299, "frac_reward_zero_std": 0.0, "grad_norm": 5.78125, "kl": 0.18386183865368366, "learning_rate": 1.8498180215410088e-05, "loss": 0.0074, "num_tokens": 11003543.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 433.5, "completions/mean_terminated_length": 433.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.25954620918649696, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.07063988805748522, "learning_rate": 1.849478390883799e-05, "loss": 0.0028, "num_tokens": 11014003.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 243.75, "completions/mean_terminated_length": 243.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.25973067699686403, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.0625027040950954, "learning_rate": 1.849138407881296e-05, "loss": 0.0025, "num_tokens": 11018785.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 333.75, "completions/mean_terminated_length": 333.75, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.25991514480723116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.04641512222588062, "learning_rate": 1.8487980726745173e-05, "loss": 0.0019, "num_tokens": 11028495.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 304.625, "completions/mean_terminated_length": 304.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.26009961261759823, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.09225375205278397, "learning_rate": 1.8484573854046274e-05, "loss": 0.0037, "num_tokens": 11033788.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 336.875, "completions/mean_terminated_length": 336.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.2602840804279653, "frac_reward_zero_std": 1.0, "grad_norm": 0.2109375, "kl": 0.06420034822076559, "learning_rate": 1.8481163462129352e-05, "loss": 0.0026, "num_tokens": 11040467.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 318.75, "completions/mean_terminated_length": 318.75, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.26046854823833243, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.055732487700879574, "learning_rate": 1.847774955240897e-05, "loss": 0.0022, "num_tokens": 11046377.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 342.5, "completions/mean_terminated_length": 342.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.2606530160486995, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.09523147670552135, "learning_rate": 1.8474332126301138e-05, "loss": 0.0038, "num_tokens": 11051909.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.2608374838590666, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.07202458241954446, "learning_rate": 1.8470911185223334e-05, "loss": 0.0029, "num_tokens": 11060484.0, "reward": 1.454545497894287, "reward_std": 0.3366618752479553, "rewards/fixed_code_pass_all_test_reward/mean": 0.45454543828964233, "rewards/fixed_code_pass_all_test_reward/std": 0.33666184544563293, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.2610219516694337, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0761791376862675, "learning_rate": 1.846748673059449e-05, "loss": 0.003, "num_tokens": 11066240.0, "reward": 1.5595238208770752, "reward_std": 0.4571003317832947, "rewards/fixed_code_pass_all_test_reward/mean": 0.6845238208770752, "rewards/fixed_code_pass_all_test_reward/std": 0.2117193192243576, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 230.375, "completions/mean_terminated_length": 230.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2612064194798008, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.11275411304086447, "learning_rate": 1.8464058763834996e-05, "loss": 0.0045, "num_tokens": 11074947.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 468.875, "completions/mean_terminated_length": 468.875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.26139088729016785, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.061733261216431856, "learning_rate": 1.84606272863667e-05, "loss": 0.0025, "num_tokens": 11083946.0, "reward": 1.4134615659713745, "reward_std": 0.25325438380241394, "rewards/fixed_code_pass_all_test_reward/mean": 0.4134615361690521, "rewards/fixed_code_pass_all_test_reward/std": 0.25325441360473633, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 345.125, "completions/mean_terminated_length": 345.125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.261575355100535, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.04770751670002937, "learning_rate": 1.84571922996129e-05, "loss": 0.0019, "num_tokens": 11090779.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 175.625, "completions/mean_terminated_length": 175.625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.26175982291090205, "frac_reward_zero_std": 0.0, "grad_norm": 3.28125, "kl": 0.11501331720501184, "learning_rate": 1.845375380499836e-05, "loss": 0.0046, "num_tokens": 11096024.0, "reward": 1.6296296119689941, "reward_std": 0.41550397872924805, "rewards/fixed_code_pass_all_test_reward/mean": 0.6296296119689941, "rewards/fixed_code_pass_all_test_reward/std": 0.41550394892692566, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 417.625, "completions/mean_terminated_length": 417.625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.2619442907212691, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.08361255517229438, "learning_rate": 1.845031180394929e-05, "loss": 0.0033, "num_tokens": 11106613.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.26212875853163625, "frac_reward_zero_std": 1.0, "grad_norm": 0.275390625, "kl": 0.12820815201848745, "learning_rate": 1.8446866297893358e-05, "loss": 0.0051, "num_tokens": 11114266.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 149.25, "completions/mean_terminated_length": 149.25, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.2623132263420033, "frac_reward_zero_std": 1.0, "grad_norm": 0.357421875, "kl": 0.13936164695769548, "learning_rate": 1.8443417288259688e-05, "loss": 0.0056, "num_tokens": 11118396.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2624976941523704, "frac_reward_zero_std": 1.0, "grad_norm": 0.255859375, "kl": 0.11911450605839491, "learning_rate": 1.843996477647885e-05, "loss": 0.0048, "num_tokens": 11122509.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.2626821619627375, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.08378063794225454, "learning_rate": 1.8436508763982875e-05, "loss": 0.0034, "num_tokens": 11132623.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.2628666297731046, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.07946379063650966, "learning_rate": 1.843304925220524e-05, "loss": 0.0032, "num_tokens": 11138499.0, "reward": 1.600235939025879, "reward_std": 0.4816190302371979, "rewards/fixed_code_pass_all_test_reward/mean": 0.7252358794212341, "rewards/fixed_code_pass_all_test_reward/std": 0.33164629340171814, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 300.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.26305109758347167, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.09720235038548708, "learning_rate": 1.8429586242580884e-05, "loss": 0.0039, "num_tokens": 11144704.0, "reward": 1.640625, "reward_std": 0.6629126071929932, "rewards/fixed_code_pass_all_test_reward/mean": 0.765625, "rewards/fixed_code_pass_all_test_reward/std": 0.30935922265052795, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.2632355653938388, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.10545479319989681, "learning_rate": 1.8426119736546175e-05, "loss": 0.0042, "num_tokens": 11152577.0, "reward": 1.8888888359069824, "reward_std": 0.061721283942461014, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888359069824, "rewards/fixed_code_pass_all_test_reward/std": 0.06172133982181549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.2634200332042059, "frac_reward_zero_std": 0.0, "grad_norm": 5.0, "kl": 0.15885676443576813, "learning_rate": 1.8422649735538952e-05, "loss": 0.0064, "num_tokens": 11160731.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.26360450101457295, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.06548916501924396, "learning_rate": 1.84191762409985e-05, "loss": 0.0026, "num_tokens": 11167870.0, "reward": 1.1741070747375488, "reward_std": 0.22703364491462708, "rewards/fixed_code_pass_all_test_reward/mean": 0.1741071492433548, "rewards/fixed_code_pass_all_test_reward/std": 0.22703365981578827, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 256.375, "completions/mean_terminated_length": 256.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.2637889688249401, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.10507871489971876, "learning_rate": 1.8415699254365544e-05, "loss": 0.0042, "num_tokens": 11176713.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.26397343663530715, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.08360732113942504, "learning_rate": 1.841221877708226e-05, "loss": 0.0033, "num_tokens": 11182469.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 384.5, "completions/mean_terminated_length": 384.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.2641579044456742, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.06041333544999361, "learning_rate": 1.8408734810592287e-05, "loss": 0.0024, "num_tokens": 11194137.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 259.25, "completions/mean_terminated_length": 259.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.26434237225604135, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.10521396435797215, "learning_rate": 1.840524735634069e-05, "loss": 0.0042, "num_tokens": 11200371.0, "reward": 1.3409090042114258, "reward_std": 0.46035251021385193, "rewards/fixed_code_pass_all_test_reward/mean": 0.46590906381607056, "rewards/fixed_code_pass_all_test_reward/std": 0.4894097149372101, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 235.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.2645268400664084, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.06152463098987937, "learning_rate": 1.840175641577399e-05, "loss": 0.0025, "num_tokens": 11206549.0, "reward": 1.4690860509872437, "reward_std": 0.07223133742809296, "rewards/fixed_code_pass_all_test_reward/mean": 0.46908602118492126, "rewards/fixed_code_pass_all_test_reward/std": 0.07223134487867355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 290.625, "completions/mean_terminated_length": 290.625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.2647113078767755, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.14911055332049727, "learning_rate": 1.839826199034015e-05, "loss": 0.006, "num_tokens": 11213682.0, "reward": 1.234375, "reward_std": 0.3499840497970581, "rewards/fixed_code_pass_all_test_reward/mean": 0.234375, "rewards/fixed_code_pass_all_test_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.2648957756871426, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.09432162716984749, "learning_rate": 1.839476408148859e-05, "loss": 0.0038, "num_tokens": 11222635.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 240.75, "completions/mean_terminated_length": 240.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.2650802434975097, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.05502032721415162, "learning_rate": 1.839126269067017e-05, "loss": 0.0022, "num_tokens": 11228785.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 333.75, "completions/mean_terminated_length": 333.75, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.26526471130787677, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0412722360342741, "learning_rate": 1.8387757819337182e-05, "loss": 0.0017, "num_tokens": 11236231.0, "reward": 1.375, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 415.875, "completions/mean_terminated_length": 415.875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.26544917911824384, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.03566544898785651, "learning_rate": 1.8384249468943372e-05, "loss": 0.0014, "num_tokens": 11248934.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 261.875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.26563364692861097, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.08652997110038996, "learning_rate": 1.8380737640943932e-05, "loss": 0.0035, "num_tokens": 11255269.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.26581811473897804, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.09356641490012407, "learning_rate": 1.8377222336795497e-05, "loss": 0.0037, "num_tokens": 11261068.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/fixed_code_pass_all_test_reward/mean": 0.984375, "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 316.5, "completions/mean_terminated_length": 316.5, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.2660025825493451, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.06636513117700815, "learning_rate": 1.8373703557956125e-05, "loss": 0.0027, "num_tokens": 11268424.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.26618705035971224, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.10908025596290827, "learning_rate": 1.837018130588534e-05, "loss": 0.0044, "num_tokens": 11272762.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.2663715181700793, "frac_reward_zero_std": 0.0, "grad_norm": 3.75, "kl": 0.10073700547218323, "learning_rate": 1.8366655582044096e-05, "loss": 0.004, "num_tokens": 11276818.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 183.875, "completions/mean_terminated_length": 183.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.2665559859804464, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.1055462546646595, "learning_rate": 1.8363126387894782e-05, "loss": 0.0042, "num_tokens": 11281945.0, "reward": 1.4431817531585693, "reward_std": 0.22498852014541626, "rewards/fixed_code_pass_all_test_reward/mean": 0.4431818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.22498852014541626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 136.875, "completions/mean_terminated_length": 136.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.2667404537908135, "frac_reward_zero_std": 1.0, "grad_norm": 0.19921875, "kl": 0.09557613218203187, "learning_rate": 1.8359593724901235e-05, "loss": 0.0038, "num_tokens": 11288344.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 315.75, "completions/mean_terminated_length": 315.75, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.2669249216011806, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.08257519407197833, "learning_rate": 1.835605759452873e-05, "loss": 0.0033, "num_tokens": 11294822.0, "reward": 1.3854166269302368, "reward_std": 0.38041719794273376, "rewards/fixed_code_pass_all_test_reward/mean": 0.3854166567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.38041722774505615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.26710938941154766, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.05439542979001999, "learning_rate": 1.8352517998243972e-05, "loss": 0.0022, "num_tokens": 11303559.0, "reward": 1.4605262279510498, "reward_std": 0.18608078360557556, "rewards/fixed_code_pass_all_test_reward/mean": 0.46052634716033936, "rewards/fixed_code_pass_all_test_reward/std": 0.18608075380325317, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.2672938572219148, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.10128084011375904, "learning_rate": 1.834897493751511e-05, "loss": 0.0041, "num_tokens": 11310927.0, "reward": 1.7834821939468384, "reward_std": 0.056821104139089584, "rewards/fixed_code_pass_all_test_reward/mean": 0.7834820747375488, "rewards/fixed_code_pass_all_test_reward/std": 0.056821078062057495, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 337.25, "completions/mean_terminated_length": 337.25, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.26747832503228186, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.08762197801843286, "learning_rate": 1.834542841381173e-05, "loss": 0.0035, "num_tokens": 11318593.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 247.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.26766279284264893, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.10855815280228853, "learning_rate": 1.8341878428604856e-05, "loss": 0.0043, "num_tokens": 11327447.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 732.0, "completions/mean_terminated_length": 732.0, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.26784726065301606, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.034249841002747416, "learning_rate": 1.8338324983366945e-05, "loss": 0.0014, "num_tokens": 11342503.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 302.875, "completions/mean_terminated_length": 302.875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.26803172846338313, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.10019222926348448, "learning_rate": 1.8334768079571886e-05, "loss": 0.004, "num_tokens": 11348486.0, "reward": 1.587499976158142, "reward_std": 0.710004985332489, "rewards/fixed_code_pass_all_test_reward/mean": 0.7124999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.4189698398113251, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 598.75, "completions/mean_terminated_length": 598.75, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.2682161962737502, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251953125, "kl": 0.053723543882369995, "learning_rate": 1.8331207718695005e-05, "loss": 0.0021, "num_tokens": 11360108.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.26840066408411734, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.06031308276578784, "learning_rate": 1.8327643902213066e-05, "loss": 0.0024, "num_tokens": 11367424.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 246.0, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.2685851318944844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.05409361142665148, "learning_rate": 1.8324076631604263e-05, "loss": 0.0022, "num_tokens": 11373840.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 475.75, "completions/mean_terminated_length": 475.75, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.2687695997048515, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.09757817024365067, "learning_rate": 1.832050590834822e-05, "loss": 0.0039, "num_tokens": 11382774.0, "reward": 1.2142857313156128, "reward_std": 0.41121309995651245, "rewards/fixed_code_pass_all_test_reward/mean": 0.3392857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.16967642307281494, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.2689540675152186, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "kl": 0.12193373870104551, "learning_rate": 1.8316931733926004e-05, "loss": 0.0049, "num_tokens": 11386958.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 165.5, "completions/mean_terminated_length": 165.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.2691385353255857, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "kl": 0.14307919889688492, "learning_rate": 1.8313354109820092e-05, "loss": 0.0057, "num_tokens": 11391162.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.26932300313595275, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.10307993786409497, "learning_rate": 1.8309773037514417e-05, "loss": 0.0041, "num_tokens": 11395969.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 340.5, "completions/mean_terminated_length": 340.5, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.2695074709463199, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.1291082901880145, "learning_rate": 1.8306188518494327e-05, "loss": 0.0052, "num_tokens": 11402645.0, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 309.0, "completions/mean_terminated_length": 309.0, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.26969193875668696, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.07357463892549276, "learning_rate": 1.83026005542466e-05, "loss": 0.0029, "num_tokens": 11412141.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.26987640656705403, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.08011516788974404, "learning_rate": 1.829900914625945e-05, "loss": 0.0032, "num_tokens": 11421561.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 162.625, "completions/mean_terminated_length": 162.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.27006087437742116, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.06783252768218517, "learning_rate": 1.829541429602251e-05, "loss": 0.0027, "num_tokens": 11428366.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 299.0, "completions/mean_terminated_length": 299.0, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.27024534218778823, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.08921116450801492, "learning_rate": 1.8291816005026853e-05, "loss": 0.0036, "num_tokens": 11435278.0, "reward": 1.0108695030212402, "reward_std": 0.030743766576051712, "rewards/fixed_code_pass_all_test_reward/mean": 0.010869565419852734, "rewards/fixed_code_pass_all_test_reward/std": 0.030743775889277458, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 634.625, "completions/mean_terminated_length": 432.71429443359375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.2704298099981553, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.05058129015378654, "learning_rate": 1.828821427476497e-05, "loss": 0.002, "num_tokens": 11445027.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 238.75, "completions/mean_terminated_length": 238.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.27061427780852243, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.08492861036211252, "learning_rate": 1.828460910673078e-05, "loss": 0.0034, "num_tokens": 11452921.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.2707987456188895, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.10649025486782193, "learning_rate": 1.8281000502419626e-05, "loss": 0.0043, "num_tokens": 11456849.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 327.5, "completions/mean_terminated_length": 327.5, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.2709832134292566, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.07331276824697852, "learning_rate": 1.8277388463328282e-05, "loss": 0.0029, "num_tokens": 11462221.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2711676812396237, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.08681609528139234, "learning_rate": 1.827377299095495e-05, "loss": 0.0035, "num_tokens": 11469405.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2713521490499908, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.09019313007593155, "learning_rate": 1.8270154086799238e-05, "loss": 0.0036, "num_tokens": 11478612.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 194.75, "completions/mean_terminated_length": 194.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.27153661686035785, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.20424759574234486, "learning_rate": 1.82665317523622e-05, "loss": 0.0082, "num_tokens": 11489922.0, "reward": 1.581730842590332, "reward_std": 0.013598186895251274, "rewards/fixed_code_pass_all_test_reward/mean": 0.5817307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.013598217628896236, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.271721084670725, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.07676692865788937, "learning_rate": 1.8262905989146293e-05, "loss": 0.0031, "num_tokens": 11495500.0, "reward": 1.8897058963775635, "reward_std": 0.31195884943008423, "rewards/fixed_code_pass_all_test_reward/mean": 0.8897058963775635, "rewards/fixed_code_pass_all_test_reward/std": 0.3119588792324066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 131.0, "completions/mean_terminated_length": 131.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.27190555248109205, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.09007346245925874, "learning_rate": 1.8259276798655413e-05, "loss": 0.0036, "num_tokens": 11499556.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 291.875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2720900202914591, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.06233324180357158, "learning_rate": 1.8255644182394866e-05, "loss": 0.0025, "num_tokens": 11508091.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.27227448810182625, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.09495929116383195, "learning_rate": 1.8252008141871384e-05, "loss": 0.0038, "num_tokens": 11513282.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.2724589559121933, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.0657485225237906, "learning_rate": 1.8248368678593116e-05, "loss": 0.0026, "num_tokens": 11517139.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.2726434237225604, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.08638333692215383, "learning_rate": 1.8244725794069634e-05, "loss": 0.0035, "num_tokens": 11528204.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.2728278915329275, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.08163391402922571, "learning_rate": 1.8241079489811926e-05, "loss": 0.0033, "num_tokens": 11536594.0, "reward": 1.6363636255264282, "reward_std": 0.2571297585964203, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 256.375, "completions/mean_terminated_length": 256.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.2730123593432946, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.08824901795014739, "learning_rate": 1.8237429767332407e-05, "loss": 0.0035, "num_tokens": 11542181.0, "reward": 1.703125, "reward_std": 0.09300297498703003, "rewards/fixed_code_pass_all_test_reward/mean": 0.703125, "rewards/fixed_code_pass_all_test_reward/std": 0.09300298243761063, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 283.875, "completions/mean_terminated_length": 283.875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.27319682715366167, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.08108174009248614, "learning_rate": 1.8233776628144894e-05, "loss": 0.0032, "num_tokens": 11551628.0, "reward": 1.40625, "reward_std": 0.4988826811313629, "rewards/fixed_code_pass_all_test_reward/mean": 0.40625, "rewards/fixed_code_pass_all_test_reward/std": 0.4988826811313629, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 106.75, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.2733812949640288, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.13857180438935757, "learning_rate": 1.8230120073764638e-05, "loss": 0.0055, "num_tokens": 11555378.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 219.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.27356576277439587, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.10179591132327914, "learning_rate": 1.8226460105708295e-05, "loss": 0.0041, "num_tokens": 11562827.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 369.375, "completions/mean_terminated_length": 369.375, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.27375023058476294, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.05637198220938444, "learning_rate": 1.8222796725493938e-05, "loss": 0.0023, "num_tokens": 11571574.0, "reward": 1.5699999332427979, "reward_std": 0.3853013515472412, "rewards/fixed_code_pass_all_test_reward/mean": 0.5699999928474426, "rewards/fixed_code_pass_all_test_reward/std": 0.3853013515472412, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.27393469839513007, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.12068467820063233, "learning_rate": 1.8219129934641066e-05, "loss": 0.0048, "num_tokens": 11579629.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.27411916620549714, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.09661076124757528, "learning_rate": 1.8215459734670574e-05, "loss": 0.0039, "num_tokens": 11584121.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.2743036340158642, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.06468256050720811, "learning_rate": 1.821178612710479e-05, "loss": 0.0026, "num_tokens": 11593627.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 194.125, "completions/mean_terminated_length": 194.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.27448810182623135, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.08303600270301104, "learning_rate": 1.8208109113467447e-05, "loss": 0.0033, "num_tokens": 11600900.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 293.5, "completions/mean_terminated_length": 293.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.2746725696365984, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.07944561308249831, "learning_rate": 1.8204428695283686e-05, "loss": 0.0032, "num_tokens": 11610504.0, "reward": 1.5153061151504517, "reward_std": 0.40136465430259705, "rewards/fixed_code_pass_all_test_reward/mean": 0.5153061151504517, "rewards/fixed_code_pass_all_test_reward/std": 0.40136465430259705, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.2748570374469655, "frac_reward_zero_std": 1.0, "grad_norm": 0.1669921875, "kl": 0.07896423060446978, "learning_rate": 1.820074487408007e-05, "loss": 0.0032, "num_tokens": 11616358.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 200.75, "completions/mean_terminated_length": 200.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.2750415052573326, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.08702874649316072, "learning_rate": 1.8197057651384564e-05, "loss": 0.0035, "num_tokens": 11621124.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.2752259730676997, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.08965241769328713, "learning_rate": 1.819336702872655e-05, "loss": 0.0036, "num_tokens": 11629675.0, "reward": 1.653846263885498, "reward_std": 0.21365079283714294, "rewards/fixed_code_pass_all_test_reward/mean": 0.6538461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.21365077793598175, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 227.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.27541044087806676, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.07587167713791132, "learning_rate": 1.8189673007636818e-05, "loss": 0.003, "num_tokens": 11635788.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.2755949086884339, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.05836274288594723, "learning_rate": 1.818597558964757e-05, "loss": 0.0023, "num_tokens": 11643606.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.27577937649880097, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.08689657971262932, "learning_rate": 1.8182274776292414e-05, "loss": 0.0035, "num_tokens": 11648344.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 311.125, "completions/mean_terminated_length": 311.125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.27596384430916804, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.07419036282226443, "learning_rate": 1.8178570569106367e-05, "loss": 0.003, "num_tokens": 11655153.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 368.75, "completions/mean_terminated_length": 368.75, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.27614831211953517, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.05812108190730214, "learning_rate": 1.817486296962586e-05, "loss": 0.0023, "num_tokens": 11663127.0, "reward": 0.5875000357627869, "reward_std": 0.628916323184967, "rewards/fixed_code_pass_all_test_reward/mean": 0.08750000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.09910312294960022, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.27633277992990224, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.12278381222859025, "learning_rate": 1.8171151979388715e-05, "loss": 0.0049, "num_tokens": 11671071.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 109.125, "completions/mean_terminated_length": 109.125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.2765172477402693, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.09034707816317677, "learning_rate": 1.8167437599934174e-05, "loss": 0.0036, "num_tokens": 11674784.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.27670171555063644, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.09323649480938911, "learning_rate": 1.8163719832802887e-05, "loss": 0.0037, "num_tokens": 11684922.0, "reward": 1.6666666269302368, "reward_std": 0.4114755690097809, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.41147559881210327, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 143.25, "completions/mean_terminated_length": 143.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.2768861833610035, "frac_reward_zero_std": 1.0, "grad_norm": 0.298828125, "kl": 0.11979239527136087, "learning_rate": 1.8159998679536895e-05, "loss": 0.0048, "num_tokens": 11689004.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.2770706511713706, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.09999455977231264, "learning_rate": 1.8156274141679664e-05, "loss": 0.004, "num_tokens": 11694270.0, "reward": 1.5, "reward_std": 0.30860668420791626, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.30860671401023865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 258.125, "completions/mean_terminated_length": 258.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.2772551189817377, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.04819012968800962, "learning_rate": 1.8152546220776036e-05, "loss": 0.0019, "num_tokens": 11702807.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 211.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.2774395867921048, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.07317747734487057, "learning_rate": 1.8148814918372288e-05, "loss": 0.0029, "num_tokens": 11710697.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.27762405460247186, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "kl": 0.12983882054686546, "learning_rate": 1.814508023601607e-05, "loss": 0.0052, "num_tokens": 11721421.0, "reward": 1.8557692766189575, "reward_std": 0.3370286822319031, "rewards/fixed_code_pass_all_test_reward/mean": 0.8557692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.3370286822319031, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.27780852241283893, "frac_reward_zero_std": 1.0, "grad_norm": 0.154296875, "kl": 0.10068689659237862, "learning_rate": 1.8141342175256457e-05, "loss": 0.004, "num_tokens": 11729999.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.27799299022320606, "frac_reward_zero_std": 1.0, "grad_norm": 0.1806640625, "kl": 0.10772743402048945, "learning_rate": 1.8137600737643915e-05, "loss": 0.0043, "num_tokens": 11736787.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 154.375, "completions/mean_terminated_length": 154.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.27817745803357313, "frac_reward_zero_std": 0.0, "grad_norm": 3.375, "kl": 0.07112570782192051, "learning_rate": 1.8133855924730307e-05, "loss": 0.0028, "num_tokens": 11743510.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.2783619258439402, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.06566012604162097, "learning_rate": 1.81301077380689e-05, "loss": 0.0026, "num_tokens": 11752685.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.27854639365430733, "frac_reward_zero_std": 1.0, "grad_norm": 0.1728515625, "kl": 0.10754591785371304, "learning_rate": 1.8126356179214366e-05, "loss": 0.0043, "num_tokens": 11757587.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 196.125, "completions/mean_terminated_length": 196.125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.2787308614646744, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.12769041769206524, "learning_rate": 1.8122601249722766e-05, "loss": 0.0051, "num_tokens": 11762484.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.2789153292750415, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.08475863235071301, "learning_rate": 1.811884295115157e-05, "loss": 0.0034, "num_tokens": 11771203.0, "reward": 1.564655065536499, "reward_std": 0.25185927748680115, "rewards/fixed_code_pass_all_test_reward/mean": 0.5646551847457886, "rewards/fixed_code_pass_all_test_reward/std": 0.25185927748680115, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2790997970854086, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.0714606549590826, "learning_rate": 1.811508128505963e-05, "loss": 0.0029, "num_tokens": 11776763.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 277.75, "completions/mean_terminated_length": 277.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.2792842648957757, "frac_reward_zero_std": 0.0, "grad_norm": 3.328125, "kl": 0.053778873989358544, "learning_rate": 1.811131625300721e-05, "loss": 0.0022, "num_tokens": 11785361.0, "reward": 1.3125, "reward_std": 0.5303300619125366, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.27946873270614275, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.09292614320293069, "learning_rate": 1.8107547856555967e-05, "loss": 0.0037, "num_tokens": 11790020.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2796532005165099, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.0864118654280901, "learning_rate": 1.8103776097268942e-05, "loss": 0.0035, "num_tokens": 11796892.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 92.5, "completions/mean_terminated_length": 92.5, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.27983766832687695, "frac_reward_zero_std": 1.0, "grad_norm": 0.494140625, "kl": 0.17758182529360056, "learning_rate": 1.8100000976710584e-05, "loss": 0.0071, "num_tokens": 11800512.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 96.375, "completions/mean_terminated_length": 96.375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.280022136137244, "frac_reward_zero_std": 1.0, "grad_norm": 0.150390625, "kl": 0.11761871632188559, "learning_rate": 1.8096222496446737e-05, "loss": 0.0047, "num_tokens": 11804019.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 165.5, "completions/mean_terminated_length": 165.5, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.28020660394761115, "frac_reward_zero_std": 0.0, "grad_norm": 3.703125, "kl": 0.15076486486941576, "learning_rate": 1.8092440658044622e-05, "loss": 0.006, "num_tokens": 11812271.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 228.875, "completions/mean_terminated_length": 228.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.2803910717579782, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.08333424525335431, "learning_rate": 1.8088655463072867e-05, "loss": 0.0033, "num_tokens": 11820110.0, "reward": 1.78125, "reward_std": 0.3604932129383087, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.17816023528575897, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 275.375, "completions/mean_terminated_length": 275.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2805755395683453, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.13739359891042113, "learning_rate": 1.8084866913101494e-05, "loss": 0.0055, "num_tokens": 11830185.0, "reward": 1.8214285373687744, "reward_std": 0.33065006136894226, "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.33065006136894226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.28076000737871243, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.10473427269607782, "learning_rate": 1.808107500970191e-05, "loss": 0.0042, "num_tokens": 11834566.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 249.75, "completions/mean_terminated_length": 249.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.2809444751890795, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.06446081819012761, "learning_rate": 1.8077279754446913e-05, "loss": 0.0026, "num_tokens": 11840940.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 199.75, "completions/mean_terminated_length": 199.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.2811289429994466, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.08398532820865512, "learning_rate": 1.807348114891069e-05, "loss": 0.0034, "num_tokens": 11851082.0, "reward": 1.4147727489471436, "reward_std": 0.36962395906448364, "rewards/fixed_code_pass_all_test_reward/mean": 0.41477274894714355, "rewards/fixed_code_pass_all_test_reward/std": 0.3696240186691284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.2813134108098137, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.041205348214134574, "learning_rate": 1.806967919466883e-05, "loss": 0.0016, "num_tokens": 11857421.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 343.0, "completions/mean_terminated_length": 343.0, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.2814978786201808, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.06032485724426806, "learning_rate": 1.806587389329829e-05, "loss": 0.0024, "num_tokens": 11865077.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.28168234643054785, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0866197943687439, "learning_rate": 1.8062065246377434e-05, "loss": 0.0035, "num_tokens": 11869558.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 163.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.281866814240915, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.05953080381732434, "learning_rate": 1.8058253255486005e-05, "loss": 0.0024, "num_tokens": 11876446.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.28205128205128205, "frac_reward_zero_std": 1.0, "grad_norm": 0.30859375, "kl": 0.20552446134388447, "learning_rate": 1.8054437922205135e-05, "loss": 0.0082, "num_tokens": 11881669.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.2822357498616491, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.08373748883605003, "learning_rate": 1.805061924811734e-05, "loss": 0.0033, "num_tokens": 11888699.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 369.75, "completions/mean_terminated_length": 369.75, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.28242021767201625, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.05836206069216132, "learning_rate": 1.8046797234806525e-05, "loss": 0.0023, "num_tokens": 11897913.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 181.75, "completions/mean_terminated_length": 181.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2826046854823833, "frac_reward_zero_std": 1.0, "grad_norm": 0.234375, "kl": 0.0900274608284235, "learning_rate": 1.804297188385798e-05, "loss": 0.0036, "num_tokens": 11902367.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 344.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.2827891532927504, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.06826892215758562, "learning_rate": 1.8039143196858374e-05, "loss": 0.0027, "num_tokens": 11910931.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 207.5, "completions/mean_terminated_length": 207.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.2829736211031175, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.0904827625490725, "learning_rate": 1.803531117539577e-05, "loss": 0.0036, "num_tokens": 11916111.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 194.125, "completions/mean_terminated_length": 194.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.2831580889134846, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.07441342831589282, "learning_rate": 1.80314758210596e-05, "loss": 0.003, "num_tokens": 11924272.0, "reward": 1.7836538553237915, "reward_std": 0.40531522035598755, "rewards/fixed_code_pass_all_test_reward/mean": 0.7836538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.40531525015830994, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.28334255672385167, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.15285276621580124, "learning_rate": 1.802763713544069e-05, "loss": 0.0061, "num_tokens": 11929730.0, "reward": 1.899999976158142, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2835270245342188, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962890625, "kl": 0.13576223235577345, "learning_rate": 1.8023795120131245e-05, "loss": 0.0054, "num_tokens": 11952140.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.28371149234458587, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.10624599875882268, "learning_rate": 1.801994977672485e-05, "loss": 0.0042, "num_tokens": 11956510.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.28389596015495294, "frac_reward_zero_std": 1.0, "grad_norm": 0.220703125, "kl": 0.07216433668509126, "learning_rate": 1.8016101106816467e-05, "loss": 0.0029, "num_tokens": 11964571.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 249.5, "completions/mean_terminated_length": 249.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.28408042796532007, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.10086528118699789, "learning_rate": 1.8012249112002446e-05, "loss": 0.004, "num_tokens": 11969623.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 514.375, "completions/mean_terminated_length": 514.375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.28426489577568714, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.044673142838291824, "learning_rate": 1.8008393793880508e-05, "loss": 0.0018, "num_tokens": 11983682.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 495.5, "completions/mean_terminated_length": 495.5, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.2844493635860542, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.07773843873292208, "learning_rate": 1.8004535154049753e-05, "loss": 0.0031, "num_tokens": 11998582.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 281.5, "completions/mean_terminated_length": 281.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.28463383139642134, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.07919527078047395, "learning_rate": 1.800067319411067e-05, "loss": 0.0032, "num_tokens": 12005306.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 120.625, "completions/mean_terminated_length": 120.625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.2848182992067884, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.0948151359334588, "learning_rate": 1.7996807915665106e-05, "loss": 0.0038, "num_tokens": 12014263.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 333.375, "completions/mean_terminated_length": 333.375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.2850027670171555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.054606346879154444, "learning_rate": 1.7992939320316305e-05, "loss": 0.0022, "num_tokens": 12021730.0, "reward": 1.2727272510528564, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.2851872348275226, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.06090571964159608, "learning_rate": 1.798906740966887e-05, "loss": 0.0024, "num_tokens": 12028868.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 369.75, "completions/mean_terminated_length": 369.75, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.2853717026378897, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.041323822806589305, "learning_rate": 1.798519218532879e-05, "loss": 0.0017, "num_tokens": 12038866.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 257.875, "completions/mean_terminated_length": 257.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.28555617044825676, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.07614508550614119, "learning_rate": 1.7981313648903422e-05, "loss": 0.003, "num_tokens": 12047489.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.2857406382586239, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.14763654116541147, "learning_rate": 1.7977431802001498e-05, "loss": 0.0059, "num_tokens": 12051890.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.28592510606899096, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.08958808984607458, "learning_rate": 1.7973546646233127e-05, "loss": 0.0036, "num_tokens": 12061679.0, "reward": 1.8333333730697632, "reward_std": 0.17817412316799164, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.17817415297031403, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 487.375, "completions/mean_terminated_length": 487.375, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.28610957387935804, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.06604207493364811, "learning_rate": 1.7969658183209788e-05, "loss": 0.0026, "num_tokens": 12079450.0, "reward": 1.8229166269302368, "reward_std": 0.3630708158016205, "rewards/fixed_code_pass_all_test_reward/mean": 0.9479166269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.1473139226436615, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 308.5, "completions/mean_terminated_length": 308.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.28629404168972516, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.08501553814858198, "learning_rate": 1.7965766414544328e-05, "loss": 0.0034, "num_tokens": 12086038.0, "reward": 1.3552632331848145, "reward_std": 0.037216171622276306, "rewards/fixed_code_pass_all_test_reward/mean": 0.3552631735801697, "rewards/fixed_code_pass_all_test_reward/std": 0.03721614181995392, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28647850950009224, "frac_reward_zero_std": 0.0, "grad_norm": 13.5, "kl": 0.08756716037169099, "learning_rate": 1.796187134185097e-05, "loss": 0.0035, "num_tokens": 12090274.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 212.25, "completions/mean_terminated_length": 212.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.2866629773104593, "frac_reward_zero_std": 1.0, "grad_norm": 0.33203125, "kl": 0.08234629291109741, "learning_rate": 1.7957972966745313e-05, "loss": 0.0033, "num_tokens": 12098076.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.28684744512082644, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.0949358232319355, "learning_rate": 1.795407129084431e-05, "loss": 0.0038, "num_tokens": 12107988.0, "reward": 1.9416667222976685, "reward_std": 0.1130387932062149, "rewards/fixed_code_pass_all_test_reward/mean": 0.9416667222976685, "rewards/fixed_code_pass_all_test_reward/std": 0.11303883045911789, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 388.75, "completions/mean_terminated_length": 388.75, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.2870319129311935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.03561586351133883, "learning_rate": 1.795016631576629e-05, "loss": 0.0014, "num_tokens": 12116274.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 368.5, "completions/mean_terminated_length": 368.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.2872163807415606, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.08451742120087147, "learning_rate": 1.794625804313096e-05, "loss": 0.0034, "num_tokens": 12126270.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 263.375, "completions/mean_terminated_length": 263.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.2874008485519277, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.07210035715252161, "learning_rate": 1.7942346474559386e-05, "loss": 0.0029, "num_tokens": 12136097.0, "reward": 1.8461538553237915, "reward_std": 0.26004743576049805, "rewards/fixed_code_pass_all_test_reward/mean": 0.8461538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.26004746556282043, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 364.25, "completions/mean_terminated_length": 364.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2875853163622948, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.10585653828456998, "learning_rate": 1.7938431611673996e-05, "loss": 0.0042, "num_tokens": 12148755.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.28776978417266186, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.08519172202795744, "learning_rate": 1.7934513456098597e-05, "loss": 0.0034, "num_tokens": 12157695.0, "reward": 1.09765625, "reward_std": 0.13615156710147858, "rewards/fixed_code_pass_all_test_reward/mean": 0.09765625, "rewards/fixed_code_pass_all_test_reward/std": 0.13615156710147858, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.287954251983029, "frac_reward_zero_std": 1.0, "grad_norm": 0.23046875, "kl": 0.08934322837740183, "learning_rate": 1.7930592009458355e-05, "loss": 0.0036, "num_tokens": 12161715.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 197.0, "completions/mean_terminated_length": 197.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.28813871979339606, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.13560397364199162, "learning_rate": 1.7926667273379798e-05, "loss": 0.0054, "num_tokens": 12166195.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 351.5, "completions/mean_terminated_length": 351.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.28832318760376313, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.10148807801306248, "learning_rate": 1.792273924949082e-05, "loss": 0.0041, "num_tokens": 12173647.0, "reward": 1.1796875, "reward_std": 0.4971318244934082, "rewards/fixed_code_pass_all_test_reward/mean": 0.3046875, "rewards/fixed_code_pass_all_test_reward/std": 0.18731389939785004, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 307.875, "completions/mean_terminated_length": 307.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.28850765541413026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.049828542629256845, "learning_rate": 1.7918807939420692e-05, "loss": 0.002, "num_tokens": 12180038.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 517.125, "completions/mean_terminated_length": 517.125, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.28869212322449733, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.050618676003068686, "learning_rate": 1.791487334480002e-05, "loss": 0.002, "num_tokens": 12193567.0, "reward": 1.240384578704834, "reward_std": 0.10432330518960953, "rewards/fixed_code_pass_all_test_reward/mean": 0.24038462340831757, "rewards/fixed_code_pass_all_test_reward/std": 0.10432329028844833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 153.625, "completions/mean_terminated_length": 153.625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.2888765910348644, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.10230624116957188, "learning_rate": 1.7910935467260803e-05, "loss": 0.0041, "num_tokens": 12200164.0, "reward": 1.9768518209457397, "reward_std": 0.06547286361455917, "rewards/fixed_code_pass_all_test_reward/mean": 0.9768518209457397, "rewards/fixed_code_pass_all_test_reward/std": 0.06547285616397858, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.28906105884523153, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.10758302873000503, "learning_rate": 1.790699430843638e-05, "loss": 0.0043, "num_tokens": 12211676.0, "reward": 1.3306450843811035, "reward_std": 0.20601192116737366, "rewards/fixed_code_pass_all_test_reward/mean": 0.3306451439857483, "rewards/fixed_code_pass_all_test_reward/std": 0.20601192116737366, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.2892455266555986, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.06866800179705024, "learning_rate": 1.7903049869961457e-05, "loss": 0.0027, "num_tokens": 12223004.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.2894299944659657, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.0997198298573494, "learning_rate": 1.789910215347211e-05, "loss": 0.004, "num_tokens": 12229094.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 155.25, "completions/mean_terminated_length": 155.25, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.2896144622763328, "frac_reward_zero_std": 1.0, "grad_norm": 0.251953125, "kl": 0.10811593942344189, "learning_rate": 1.7895151160605758e-05, "loss": 0.0043, "num_tokens": 12233192.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 340.625, "completions/mean_terminated_length": 340.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.2897989300866999, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.07231669360771775, "learning_rate": 1.7891196893001193e-05, "loss": 0.0029, "num_tokens": 12244749.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 142.0, "completions/mean_terminated_length": 142.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.28998339789706695, "frac_reward_zero_std": 1.0, "grad_norm": 0.1806640625, "kl": 0.10232794610783458, "learning_rate": 1.7887239352298557e-05, "loss": 0.0041, "num_tokens": 12251573.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 308.625, "completions/mean_terminated_length": 308.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.290167865707434, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.08189977332949638, "learning_rate": 1.7883278540139356e-05, "loss": 0.0033, "num_tokens": 12261410.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.29035233351780115, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.10574955213814974, "learning_rate": 1.7879314458166445e-05, "loss": 0.0042, "num_tokens": 12269013.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 231.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.2905368013281682, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.08040069369599223, "learning_rate": 1.7875347108024043e-05, "loss": 0.0032, "num_tokens": 12274011.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 267.0, "completions/mean_terminated_length": 267.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.2907212691385353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.08125743782147765, "learning_rate": 1.7871376491357718e-05, "loss": 0.0033, "num_tokens": 12283763.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 250.625, "completions/mean_terminated_length": 250.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.2909057369489024, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.0885463198646903, "learning_rate": 1.78674026098144e-05, "loss": 0.0035, "num_tokens": 12291752.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.2910902047592695, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.05856915167532861, "learning_rate": 1.7863425465042366e-05, "loss": 0.0023, "num_tokens": 12299067.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 365.125, "completions/mean_terminated_length": 365.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2912746725696366, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.05617827083915472, "learning_rate": 1.785944505869125e-05, "loss": 0.0022, "num_tokens": 12308724.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 441.0, "completions/mean_terminated_length": 441.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.2914591403800037, "frac_reward_zero_std": 1.0, "grad_norm": 0.162109375, "kl": 0.04853778867982328, "learning_rate": 1.7855461392412037e-05, "loss": 0.0019, "num_tokens": 12321444.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.2916436081903708, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.07572739850729704, "learning_rate": 1.7851474467857072e-05, "loss": 0.003, "num_tokens": 12333109.0, "reward": 1.140756368637085, "reward_std": 0.11127942055463791, "rewards/fixed_code_pass_all_test_reward/mean": 0.14075630903244019, "rewards/fixed_code_pass_all_test_reward/std": 0.11127939820289612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 238.75, "completions/mean_terminated_length": 238.75, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.29182807600073785, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.09630253026261926, "learning_rate": 1.784748428668004e-05, "loss": 0.0039, "num_tokens": 12337843.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.292012543811105, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.10691399499773979, "learning_rate": 1.7843490850535983e-05, "loss": 0.0043, "num_tokens": 12347009.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 315.875, "completions/mean_terminated_length": 315.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.29219701162147205, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0876840902492404, "learning_rate": 1.7839494161081293e-05, "loss": 0.0035, "num_tokens": 12356792.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 660.0, "completions/mean_terminated_length": 660.0, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.2923814794318391, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.06674418086186051, "learning_rate": 1.783549421997371e-05, "loss": 0.0027, "num_tokens": 12368376.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 434.25, "completions/mean_terminated_length": 434.25, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.29256594724220625, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.10663507878780365, "learning_rate": 1.7831491028872323e-05, "loss": 0.0043, "num_tokens": 12378674.0, "reward": 1.7115384340286255, "reward_std": 0.39864522218704224, "rewards/fixed_code_pass_all_test_reward/mean": 0.7115384340286255, "rewards/fixed_code_pass_all_test_reward/std": 0.39864522218704224, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.2927504150525733, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.0652273865416646, "learning_rate": 1.7827484589437568e-05, "loss": 0.0026, "num_tokens": 12387997.0, "reward": 1.9030611515045166, "reward_std": 0.2741842567920685, "rewards/fixed_code_pass_all_test_reward/mean": 0.9030612111091614, "rewards/fixed_code_pass_all_test_reward/std": 0.2741842567920685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 888.375, "completions/mean_terminated_length": 888.375, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.2929348828629404, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.026922443183138967, "learning_rate": 1.782347490333123e-05, "loss": 0.0011, "num_tokens": 12403832.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 380.375, "completions/mean_terminated_length": 380.375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.2931193506733075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.05331743648275733, "learning_rate": 1.7819461972216446e-05, "loss": 0.0021, "num_tokens": 12415555.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 360.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.2933038184836746, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.10103878565132618, "learning_rate": 1.7815445797757685e-05, "loss": 0.004, "num_tokens": 12426725.0, "reward": 1.9027777910232544, "reward_std": 0.2749859392642975, "rewards/fixed_code_pass_all_test_reward/mean": 0.9027777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.2749859690666199, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.29348828629404167, "frac_reward_zero_std": 1.0, "grad_norm": 0.73828125, "kl": 0.1214023120701313, "learning_rate": 1.7811426381620772e-05, "loss": 0.0049, "num_tokens": 12435984.0, "reward": 1.7391304969787598, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.739130437374115, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 245.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2936727541044088, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.1086875144392252, "learning_rate": 1.7807403725472877e-05, "loss": 0.0043, "num_tokens": 12443770.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 293.875, "completions/mean_terminated_length": 293.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.29385722191477587, "frac_reward_zero_std": 1.0, "grad_norm": 0.396484375, "kl": 0.10329505475237966, "learning_rate": 1.7803377830982506e-05, "loss": 0.0041, "num_tokens": 12453105.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 384.125, "completions/mean_terminated_length": 384.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.29404168972514294, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.06821715040132403, "learning_rate": 1.7799348699819518e-05, "loss": 0.0027, "num_tokens": 12463618.0, "reward": 1.0625, "reward_std": 0.025253789499402046, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.025253813713788986, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 475.875, "completions/mean_terminated_length": 475.875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.29422615753551007, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.08961744653061032, "learning_rate": 1.7795316333655107e-05, "loss": 0.0036, "num_tokens": 12472201.0, "reward": 1.8050000667572021, "reward_std": 0.29115530848503113, "rewards/fixed_code_pass_all_test_reward/mean": 0.8050000071525574, "rewards/fixed_code_pass_all_test_reward/std": 0.2911553382873535, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 198.25, "completions/mean_terminated_length": 198.25, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.29441062534587714, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.11212045419961214, "learning_rate": 1.7791280734161808e-05, "loss": 0.0045, "num_tokens": 12476771.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.2945950931562442, "frac_reward_zero_std": 0.0, "grad_norm": 3.328125, "kl": 0.11013233428820968, "learning_rate": 1.778724190301351e-05, "loss": 0.0044, "num_tokens": 12484716.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 329.0, "completions/mean_terminated_length": 329.0, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.29477956096661134, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.06398802949115634, "learning_rate": 1.7783199841885426e-05, "loss": 0.0026, "num_tokens": 12495396.0, "reward": 1.0, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 263.625, "completions/mean_terminated_length": 263.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.2949640287769784, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.06285704881884158, "learning_rate": 1.7779154552454114e-05, "loss": 0.0025, "num_tokens": 12501057.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 468.375, "completions/mean_terminated_length": 468.375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2951484965873455, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.1172210993245244, "learning_rate": 1.7775106036397475e-05, "loss": 0.0047, "num_tokens": 12513940.0, "reward": 1.1022727489471436, "reward_std": 0.1643383502960205, "rewards/fixed_code_pass_all_test_reward/mean": 0.10227273404598236, "rewards/fixed_code_pass_all_test_reward/std": 0.16433832049369812, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 205.0, "completions/mean_terminated_length": 205.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2953329643977126, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.10533142695203424, "learning_rate": 1.777105429539475e-05, "loss": 0.0042, "num_tokens": 12518596.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 313.25, "completions/mean_terminated_length": 313.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.2955174322080797, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.10131804924458265, "learning_rate": 1.7766999331126506e-05, "loss": 0.0041, "num_tokens": 12525390.0, "reward": 0.9684066772460938, "reward_std": 0.36801764369010925, "rewards/fixed_code_pass_all_test_reward/mean": 0.09340660274028778, "rewards/fixed_code_pass_all_test_reward/std": 0.05086923763155937, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.29570190001844676, "frac_reward_zero_std": 1.0, "grad_norm": 0.51171875, "kl": 0.12930389400571585, "learning_rate": 1.7762941145274663e-05, "loss": 0.0052, "num_tokens": 12530062.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 403.375, "completions/mean_terminated_length": 403.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.2958863678288139, "frac_reward_zero_std": 1.0, "grad_norm": 1.0, "kl": 0.13073689490556717, "learning_rate": 1.7758879739522467e-05, "loss": 0.0052, "num_tokens": 12536425.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 350.625, "completions/mean_terminated_length": 350.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.29607083563918096, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.07541354931890965, "learning_rate": 1.7754815115554494e-05, "loss": 0.003, "num_tokens": 12543038.0, "reward": 1.3499999046325684, "reward_std": 0.1414213478565216, "rewards/fixed_code_pass_all_test_reward/mean": 0.3500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 507.125, "completions/mean_terminated_length": 507.125, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.29625530344954804, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.048309196485206485, "learning_rate": 1.775074727505667e-05, "loss": 0.0019, "num_tokens": 12552351.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.29643977125991516, "frac_reward_zero_std": 0.0, "grad_norm": 3.6875, "kl": 0.10865723015740514, "learning_rate": 1.7746676219716242e-05, "loss": 0.0043, "num_tokens": 12556050.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 354.5, "completions/mean_terminated_length": 354.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.29662423907028224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.05823033070191741, "learning_rate": 1.77426019512218e-05, "loss": 0.0023, "num_tokens": 12569198.0, "reward": 1.6734693050384521, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6734693646430969, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 619.125, "completions/mean_terminated_length": 619.125, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.2968087068806493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0478515625, "kl": 0.03381074080243707, "learning_rate": 1.7738524471263262e-05, "loss": 0.0014, "num_tokens": 12583223.0, "reward": 1.9506173133850098, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9506173133850098, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 508.875, "completions/mean_terminated_length": 508.875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.29699317469101644, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.08280376438051462, "learning_rate": 1.7734443781531874e-05, "loss": 0.0033, "num_tokens": 12595862.0, "reward": 1.1828703880310059, "reward_std": 0.4809151589870453, "rewards/fixed_code_pass_all_test_reward/mean": 0.30787038803100586, "rewards/fixed_code_pass_all_test_reward/std": 0.1353382170200348, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 302.25, "completions/mean_terminated_length": 302.25, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.2971776425013835, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.05121806706301868, "learning_rate": 1.7730359883720228e-05, "loss": 0.002, "num_tokens": 12603840.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 368.375, "completions/mean_terminated_length": 368.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.2973621103117506, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.044601946137845516, "learning_rate": 1.772627277952223e-05, "loss": 0.0018, "num_tokens": 12612803.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.2975465781221177, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.06876032124273479, "learning_rate": 1.772218247063312e-05, "loss": 0.0028, "num_tokens": 12622092.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 368.0, "completions/mean_terminated_length": 368.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.2977310459324848, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.0760411387309432, "learning_rate": 1.7718088958749474e-05, "loss": 0.003, "num_tokens": 12634404.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.29791551374285186, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.07906319946050644, "learning_rate": 1.771399224556919e-05, "loss": 0.0032, "num_tokens": 12640448.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.298099981553219, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.06779517233371735, "learning_rate": 1.77098923327915e-05, "loss": 0.0027, "num_tokens": 12644566.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 240.75, "completions/mean_terminated_length": 240.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.29828444936358606, "frac_reward_zero_std": 0.0, "grad_norm": 3.328125, "kl": 0.07987850764766335, "learning_rate": 1.7705789222116957e-05, "loss": 0.0032, "num_tokens": 12649820.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 466.875, "completions/mean_terminated_length": 466.875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.29846891717395313, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.060588066931813955, "learning_rate": 1.770168291524744e-05, "loss": 0.0024, "num_tokens": 12663123.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 242.875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.29865338498432026, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.07768997130915523, "learning_rate": 1.7697573413886154e-05, "loss": 0.0031, "num_tokens": 12667978.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 302.875, "completions/mean_terminated_length": 302.875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.29883785279468733, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.06340068019926548, "learning_rate": 1.7693460719737644e-05, "loss": 0.0025, "num_tokens": 12674737.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 524.25, "completions/mean_terminated_length": 524.25, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.2990223206050544, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.07209276780486107, "learning_rate": 1.7689344834507756e-05, "loss": 0.0029, "num_tokens": 12687027.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 162.125, "completions/mean_terminated_length": 162.125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.29920678841542153, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.04901959979906678, "learning_rate": 1.7685225759903675e-05, "loss": 0.002, "num_tokens": 12691236.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 379.125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.2993912562257886, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.04718544543720782, "learning_rate": 1.76811034976339e-05, "loss": 0.0019, "num_tokens": 12701389.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 267.0, "completions/mean_terminated_length": 267.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2995757240361557, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.08643184415996075, "learning_rate": 1.7676978049408262e-05, "loss": 0.0035, "num_tokens": 12709573.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 342.375, "completions/mean_terminated_length": 342.375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.2997601918465228, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.0739885801449418, "learning_rate": 1.7672849416937903e-05, "loss": 0.003, "num_tokens": 12716136.0, "reward": 1.265625, "reward_std": 0.6828168034553528, "rewards/fixed_code_pass_all_test_reward/mean": 0.515625, "rewards/fixed_code_pass_all_test_reward/std": 0.2538762092590332, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 319.5, "completions/mean_terminated_length": 319.5, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.2999446596568899, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.030056790565140545, "learning_rate": 1.766871760193529e-05, "loss": 0.0012, "num_tokens": 12722428.0, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 338.125, "completions/mean_terminated_length": 338.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.30012912746725695, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.062247148947790265, "learning_rate": 1.7664582606114217e-05, "loss": 0.0025, "num_tokens": 12730909.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 326.5, "completions/mean_terminated_length": 326.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.3003135952776241, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.0587624697946012, "learning_rate": 1.766044443118978e-05, "loss": 0.0024, "num_tokens": 12738161.0, "reward": 1.793269157409668, "reward_std": 0.3554535210132599, "rewards/fixed_code_pass_all_test_reward/mean": 0.9182692170143127, "rewards/fixed_code_pass_all_test_reward/std": 0.052161648869514465, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 344.125, "completions/mean_terminated_length": 344.125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.30049806308799115, "frac_reward_zero_std": 1.0, "grad_norm": 0.1826171875, "kl": 0.12019967194646597, "learning_rate": 1.765630307887842e-05, "loss": 0.0048, "num_tokens": 12748554.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 240.0, "completions/mean_terminated_length": 240.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.3006825308983582, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.0808140030130744, "learning_rate": 1.7652158550897868e-05, "loss": 0.0032, "num_tokens": 12755762.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.30086699870872535, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.07831136649474502, "learning_rate": 1.7648010848967186e-05, "loss": 0.0031, "num_tokens": 12765301.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.3010514665190924, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.06648185104131699, "learning_rate": 1.764385997480676e-05, "loss": 0.0027, "num_tokens": 12771069.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 253.25, "completions/mean_terminated_length": 253.25, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.3012359343294595, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.10702416859567165, "learning_rate": 1.763970593013827e-05, "loss": 0.0043, "num_tokens": 12780231.0, "reward": 1.798387050628662, "reward_std": 0.3733145296573639, "rewards/fixed_code_pass_all_test_reward/mean": 0.7983871102333069, "rewards/fixed_code_pass_all_test_reward/std": 0.3733145594596863, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 448.75, "completions/mean_terminated_length": 448.75, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.3014204021398266, "frac_reward_zero_std": 1.0, "grad_norm": 0.05859375, "kl": 0.04147757729515433, "learning_rate": 1.7635548716684733e-05, "loss": 0.0017, "num_tokens": 12793237.0, "reward": 1.0370370149612427, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.03703703731298447, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 422.0, "completions/mean_terminated_length": 422.0, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.3016048699501937, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.040194901870563626, "learning_rate": 1.7631388336170472e-05, "loss": 0.0016, "num_tokens": 12801005.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 297.5, "completions/mean_terminated_length": 297.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.30178933776056077, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.12538123223930597, "learning_rate": 1.762722479032112e-05, "loss": 0.005, "num_tokens": 12809145.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 399.25, "completions/mean_terminated_length": 399.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.3019738055709279, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.08356407471001148, "learning_rate": 1.7623058080863626e-05, "loss": 0.0033, "num_tokens": 12821003.0, "reward": 1.0166666507720947, "reward_std": 0.5348193645477295, "rewards/fixed_code_pass_all_test_reward/mean": 0.14166668057441711, "rewards/fixed_code_pass_all_test_reward/std": 0.3472111225128174, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.302158273381295, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.0558644545963034, "learning_rate": 1.7618888209526246e-05, "loss": 0.0022, "num_tokens": 12827044.0, "reward": 1.796875, "reward_std": 0.5745242834091187, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 556.5, "completions/mean_terminated_length": 556.5, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.30234274119166205, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.03551404387690127, "learning_rate": 1.761471517803856e-05, "loss": 0.0014, "num_tokens": 12842424.0, "reward": 1.7265625, "reward_std": 0.11048543453216553, "rewards/fixed_code_pass_all_test_reward/mean": 0.7265625, "rewards/fixed_code_pass_all_test_reward/std": 0.11048544198274612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 275.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.3025272090020291, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.06396979955025017, "learning_rate": 1.7610538988131452e-05, "loss": 0.0026, "num_tokens": 12851310.0, "reward": 1.9187500476837158, "reward_std": 0.2298097163438797, "rewards/fixed_code_pass_all_test_reward/mean": 0.918749988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.2298097163438797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 314.75, "completions/mean_terminated_length": 314.75, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.30271167681239625, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.07790955249220133, "learning_rate": 1.7606359641537116e-05, "loss": 0.0031, "num_tokens": 12860508.0, "reward": 1.611842155456543, "reward_std": 0.6763209104537964, "rewards/fixed_code_pass_all_test_reward/mean": 0.7368420958518982, "rewards/fixed_code_pass_all_test_reward/std": 0.3491183817386627, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.3028961446227633, "frac_reward_zero_std": 1.0, "grad_norm": 0.181640625, "kl": 0.08304731361567974, "learning_rate": 1.7602177139989046e-05, "loss": 0.0033, "num_tokens": 12864899.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 342.25, "completions/mean_terminated_length": 342.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.3030806124331304, "frac_reward_zero_std": 1.0, "grad_norm": 0.05126953125, "kl": 0.04695043433457613, "learning_rate": 1.759799148522206e-05, "loss": 0.0019, "num_tokens": 12874373.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.3032650802434975, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.06385319191031158, "learning_rate": 1.7593802678972272e-05, "loss": 0.0026, "num_tokens": 12879322.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3034495480538646, "frac_reward_zero_std": 1.0, "grad_norm": 0.267578125, "kl": 0.10074855713173747, "learning_rate": 1.7589610722977118e-05, "loss": 0.004, "num_tokens": 12883290.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 122.125, "completions/mean_terminated_length": 122.125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.30363401586423167, "frac_reward_zero_std": 1.0, "grad_norm": 0.154296875, "kl": 0.10567496158182621, "learning_rate": 1.7585415618975324e-05, "loss": 0.0042, "num_tokens": 12888899.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 323.625, "completions/mean_terminated_length": 323.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.3038184836745988, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.07204781705513597, "learning_rate": 1.7581217368706927e-05, "loss": 0.0029, "num_tokens": 12897184.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 357.0, "completions/mean_terminated_length": 357.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.30400295148496587, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.0890396717004478, "learning_rate": 1.7577015973913278e-05, "loss": 0.0036, "num_tokens": 12904112.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 492.875, "completions/mean_terminated_length": 492.875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.30418741929533294, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0479298762511462, "learning_rate": 1.7572811436337014e-05, "loss": 0.0019, "num_tokens": 12913351.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 289.875, "completions/mean_terminated_length": 289.875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.30437188710570007, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.12769436184316874, "learning_rate": 1.7568603757722096e-05, "loss": 0.0051, "num_tokens": 12919550.0, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/fixed_code_pass_all_test_reward/mean": 0.890625, "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 169.875, "completions/mean_terminated_length": 169.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.30455635491606714, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.0732993665151298, "learning_rate": 1.756439293981377e-05, "loss": 0.0029, "num_tokens": 12923957.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 321.125, "completions/mean_terminated_length": 321.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.3047408227264342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.08487902581691742, "learning_rate": 1.7560178984358602e-05, "loss": 0.0034, "num_tokens": 12932678.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 399.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.30492529053680134, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.06112538347952068, "learning_rate": 1.7555961893104444e-05, "loss": 0.0024, "num_tokens": 12944190.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 357.375, "completions/mean_terminated_length": 357.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.3051097583471684, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.0801716479472816, "learning_rate": 1.7551741667800453e-05, "loss": 0.0032, "num_tokens": 12951425.0, "reward": 1.7580645084381104, "reward_std": 0.4483092129230499, "rewards/fixed_code_pass_all_test_reward/mean": 0.8830645084381104, "rewards/fixed_code_pass_all_test_reward/std": 0.3307435214519501, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 340.875, "completions/mean_terminated_length": 340.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.3052942261575355, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.0692962808534503, "learning_rate": 1.7547518310197093e-05, "loss": 0.0028, "num_tokens": 12958288.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 280.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.3054786939679026, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.07739010080695152, "learning_rate": 1.7543291822046113e-05, "loss": 0.0031, "num_tokens": 12969456.0, "reward": 1.2196602821350098, "reward_std": 0.01716279797255993, "rewards/fixed_code_pass_all_test_reward/mean": 0.2196601927280426, "rewards/fixed_code_pass_all_test_reward/std": 0.017162786796689034, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.3056631617782697, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.08678334299474955, "learning_rate": 1.753906220510058e-05, "loss": 0.0035, "num_tokens": 12973953.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 379.25, "completions/mean_terminated_length": 379.25, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.30584762958863676, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.07424144493415952, "learning_rate": 1.7534829461114836e-05, "loss": 0.003, "num_tokens": 12984067.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 311.5, "completions/mean_terminated_length": 311.5, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.3060320973990039, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.05165929882787168, "learning_rate": 1.753059359184454e-05, "loss": 0.0021, "num_tokens": 12991319.0, "reward": 1.9765625, "reward_std": 0.03234682232141495, "rewards/fixed_code_pass_all_test_reward/mean": 0.9765625, "rewards/fixed_code_pass_all_test_reward/std": 0.032346826046705246, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 87.375, "completions/mean_terminated_length": 87.375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.30621656520937096, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.1200735280290246, "learning_rate": 1.7526354599046637e-05, "loss": 0.0048, "num_tokens": 12994642.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 362.0, "completions/mean_terminated_length": 362.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.30640103301973803, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.06608133018016815, "learning_rate": 1.752211248447937e-05, "loss": 0.0026, "num_tokens": 13002322.0, "reward": 1.7222222089767456, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7222222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 498.375, "completions/mean_terminated_length": 277.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.30658550083010516, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.09205378778278828, "learning_rate": 1.7517867249902272e-05, "loss": 0.0037, "num_tokens": 13027125.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 381.75, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.30676996864047223, "frac_reward_zero_std": 1.0, "grad_norm": 0.203125, "kl": 0.04283121461048722, "learning_rate": 1.7513618897076183e-05, "loss": 0.0017, "num_tokens": 13035755.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 115.0, "completions/mean_terminated_length": 115.0, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.3069544364508393, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.07612161291763186, "learning_rate": 1.750936742776322e-05, "loss": 0.003, "num_tokens": 13039419.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.30713890426120644, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.0906263217329979, "learning_rate": 1.7505112843726807e-05, "loss": 0.0036, "num_tokens": 13043568.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.3073233720715735, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.05573075730353594, "learning_rate": 1.750085514673165e-05, "loss": 0.0022, "num_tokens": 13050415.0, "reward": 1.735795497894287, "reward_std": 0.10445894300937653, "rewards/fixed_code_pass_all_test_reward/mean": 0.7357954382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.10445895045995712, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 298.125, "completions/mean_terminated_length": 298.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.3075078398819406, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.05252767005003989, "learning_rate": 1.749659433854375e-05, "loss": 0.0021, "num_tokens": 13059632.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 225.25, "completions/mean_terminated_length": 225.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.3076923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.0607409302610904, "learning_rate": 1.7492330420930398e-05, "loss": 0.0024, "num_tokens": 13067218.0, "reward": 1.993181824684143, "reward_std": 0.01928471215069294, "rewards/fixed_code_pass_all_test_reward/mean": 0.9931818246841431, "rewards/fixed_code_pass_all_test_reward/std": 0.01928473263978958, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.3078767755026748, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.13396843429654837, "learning_rate": 1.7488063395660177e-05, "loss": 0.0054, "num_tokens": 13074754.0, "reward": 1.6785714626312256, "reward_std": 0.325023889541626, "rewards/fixed_code_pass_all_test_reward/mean": 0.6785714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.3250238597393036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 153.75, "completions/mean_terminated_length": 153.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.30806124331304185, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.17450826428830624, "learning_rate": 1.7483793264502952e-05, "loss": 0.007, "num_tokens": 13078832.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.308245711123409, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.09287993377074599, "learning_rate": 1.747952002922989e-05, "loss": 0.0037, "num_tokens": 13084262.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 368.625, "completions/mean_terminated_length": 368.625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.30843017893377606, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.03878854983486235, "learning_rate": 1.747524369161343e-05, "loss": 0.0016, "num_tokens": 13097307.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 358.625, "completions/mean_terminated_length": 358.625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.30861464674414313, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.0627593225799501, "learning_rate": 1.7470964253427305e-05, "loss": 0.0025, "num_tokens": 13109880.0, "reward": 1.4866070747375488, "reward_std": 0.34675586223602295, "rewards/fixed_code_pass_all_test_reward/mean": 0.4866071343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.34675586223602295, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 615.25, "completions/mean_terminated_length": 615.25, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.30879911455451026, "frac_reward_zero_std": 0.0, "grad_norm": 0.89453125, "kl": 0.041732818353921175, "learning_rate": 1.7466681716446536e-05, "loss": 0.0017, "num_tokens": 13126306.0, "reward": 1.234375, "reward_std": 0.19408094882965088, "rewards/fixed_code_pass_all_test_reward/mean": 0.234375, "rewards/fixed_code_pass_all_test_reward/std": 0.19408094882965088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 116.625, "completions/mean_terminated_length": 116.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.30898358236487733, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.17682586424052715, "learning_rate": 1.7462396082447422e-05, "loss": 0.0071, "num_tokens": 13130271.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 189.125, "completions/mean_terminated_length": 189.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.3091680501752444, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.11428327858448029, "learning_rate": 1.745810735320756e-05, "loss": 0.0046, "num_tokens": 13140064.0, "reward": 1.4642857313156128, "reward_std": 0.402253121137619, "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.4022531509399414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 568.125, "completions/mean_terminated_length": 568.125, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.30935251798561153, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.055907859932631254, "learning_rate": 1.7453815530505815e-05, "loss": 0.0022, "num_tokens": 13150985.0, "reward": 1.6964285373687744, "reward_std": 0.4281460642814636, "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.33065006136894226, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 138.25, "completions/mean_terminated_length": 138.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3095369857959786, "frac_reward_zero_std": 1.0, "grad_norm": 0.40625, "kl": 0.08761957101523876, "learning_rate": 1.7449520616122344e-05, "loss": 0.0035, "num_tokens": 13157883.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 117.875, "completions/mean_terminated_length": 117.875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.3097214536063457, "frac_reward_zero_std": 0.0, "grad_norm": 4.59375, "kl": 0.1260149898007512, "learning_rate": 1.7445222611838587e-05, "loss": 0.005, "num_tokens": 13161666.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.3099059214167128, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.08866892382502556, "learning_rate": 1.7440921519437258e-05, "loss": 0.0035, "num_tokens": 13166938.0, "reward": 1.7613637447357178, "reward_std": 0.27028122544288635, "rewards/fixed_code_pass_all_test_reward/mean": 0.7613636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.27028125524520874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 208.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.3100903892270799, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.07182240299880505, "learning_rate": 1.7436617340702362e-05, "loss": 0.0029, "num_tokens": 13177119.0, "reward": 1.7937500476837158, "reward_std": 0.29932963848114014, "rewards/fixed_code_pass_all_test_reward/mean": 0.793749988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.29932960867881775, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 116.75, "completions/mean_terminated_length": 116.75, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.31027485703744695, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.052422692999243736, "learning_rate": 1.743231007741918e-05, "loss": 0.0021, "num_tokens": 13180877.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 121.0, "completions/mean_terminated_length": 121.0, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.3104593248478141, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.09947034390643239, "learning_rate": 1.742799973137427e-05, "loss": 0.004, "num_tokens": 13189797.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 133.625, "completions/mean_terminated_length": 133.625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.31064379265818115, "frac_reward_zero_std": 1.0, "grad_norm": 0.1630859375, "kl": 0.08301729685626924, "learning_rate": 1.742368630435547e-05, "loss": 0.0033, "num_tokens": 13198826.0, "reward": 1.25, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.3108282604685482, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.09808852430433035, "learning_rate": 1.7419369798151895e-05, "loss": 0.0039, "num_tokens": 13207129.0, "reward": 1.90116286277771, "reward_std": 0.2795538306236267, "rewards/fixed_code_pass_all_test_reward/mean": 0.9011628031730652, "rewards/fixed_code_pass_all_test_reward/std": 0.2795538604259491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.31101272827891535, "frac_reward_zero_std": 1.0, "grad_norm": 0.30859375, "kl": 0.07763095339760184, "learning_rate": 1.7415050214553942e-05, "loss": 0.0031, "num_tokens": 13215297.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 113.375, "completions/mean_terminated_length": 113.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3111971960892824, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.05645380727946758, "learning_rate": 1.7410727555353282e-05, "loss": 0.0023, "num_tokens": 13225996.0, "reward": 1.317307710647583, "reward_std": 0.1958465278148651, "rewards/fixed_code_pass_all_test_reward/mean": 0.317307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.1958465576171875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 500.375, "completions/mean_terminated_length": 500.375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.3113816638996495, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.08040047297254205, "learning_rate": 1.7406401822342854e-05, "loss": 0.0032, "num_tokens": 13240239.0, "reward": 0.798076868057251, "reward_std": 0.6879441142082214, "rewards/fixed_code_pass_all_test_reward/mean": 0.17307692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.238868847489357, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 124.625, "completions/mean_terminated_length": 124.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.3115661317100166, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.07063586206641048, "learning_rate": 1.7402073017316887e-05, "loss": 0.0028, "num_tokens": 13244292.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 467.125, "completions/mean_terminated_length": 467.125, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.3117505995203837, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.05623632075730711, "learning_rate": 1.739774114207087e-05, "loss": 0.0022, "num_tokens": 13257013.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.31193506733075077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.06074807560071349, "learning_rate": 1.7393406198401572e-05, "loss": 0.0024, "num_tokens": 13263570.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 191.625, "completions/mean_terminated_length": 191.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.3121195351411179, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.03830935969017446, "learning_rate": 1.7389068188107036e-05, "loss": 0.0015, "num_tokens": 13272903.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.31230400295148497, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.059072304982692, "learning_rate": 1.7384727112986576e-05, "loss": 0.0024, "num_tokens": 13281844.0, "reward": 1.019230842590332, "reward_std": 0.6827793717384338, "rewards/fixed_code_pass_all_test_reward/mean": 0.26923078298568726, "rewards/fixed_code_pass_all_test_reward/std": 0.3131386339664459, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.31248847076185204, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.05871457979083061, "learning_rate": 1.738038297484077e-05, "loss": 0.0023, "num_tokens": 13288548.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3126729385722192, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.10256790556013584, "learning_rate": 1.737603577547148e-05, "loss": 0.0041, "num_tokens": 13297865.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.31285740638258625, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.07016018684953451, "learning_rate": 1.7371685516681825e-05, "loss": 0.0028, "num_tokens": 13305429.0, "reward": 1.7083332538604736, "reward_std": 0.4249183237552643, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.32120802998542786, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.3130418741929533, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.11232127901166677, "learning_rate": 1.7367332200276204e-05, "loss": 0.0045, "num_tokens": 13309149.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 168.125, "completions/mean_terminated_length": 168.125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.31322634200332045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0517578125, "kl": 0.0494424591306597, "learning_rate": 1.7362975828060274e-05, "loss": 0.002, "num_tokens": 13316974.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 331.375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.3134108098136875, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.06321074068546295, "learning_rate": 1.7358616401840965e-05, "loss": 0.0025, "num_tokens": 13324609.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 153.25, "completions/mean_terminated_length": 153.25, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.3135952776240546, "frac_reward_zero_std": 1.0, "grad_norm": 0.22265625, "kl": 0.09891465888358653, "learning_rate": 1.735425392342647e-05, "loss": 0.004, "num_tokens": 13331187.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.3137797454344217, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.07714067678898573, "learning_rate": 1.734988839462626e-05, "loss": 0.0031, "num_tokens": 13335644.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.3139642132447888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.03441539756022394, "learning_rate": 1.7345519817251056e-05, "loss": 0.0014, "num_tokens": 13341736.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 461.25, "completions/mean_terminated_length": 461.25, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.31414868105515587, "frac_reward_zero_std": 1.0, "grad_norm": 0.2021484375, "kl": 0.04007669142447412, "learning_rate": 1.7341148193112846e-05, "loss": 0.0016, "num_tokens": 13351210.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 298.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.314333148865523, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.13051874982193112, "learning_rate": 1.733677352402489e-05, "loss": 0.0052, "num_tokens": 13360068.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.31451761667589007, "frac_reward_zero_std": 1.0, "grad_norm": 0.1513671875, "kl": 0.08455058466643095, "learning_rate": 1.7332395811801706e-05, "loss": 0.0034, "num_tokens": 13366828.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 159.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.31470208448625714, "frac_reward_zero_std": 0.0, "grad_norm": 3.609375, "kl": 0.12690742779523134, "learning_rate": 1.7328015058259077e-05, "loss": 0.0051, "num_tokens": 13371850.0, "reward": 1.3046875, "reward_std": 0.4509260058403015, "rewards/fixed_code_pass_all_test_reward/mean": 0.3046875, "rewards/fixed_code_pass_all_test_reward/std": 0.4509260058403015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 230.625, "completions/mean_terminated_length": 230.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.3148865522966242, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.12414658954367042, "learning_rate": 1.732363126521404e-05, "loss": 0.005, "num_tokens": 13380951.0, "reward": 0.6057692170143127, "reward_std": 0.7043368816375732, "rewards/fixed_code_pass_all_test_reward/mean": 0.10576923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.29916059970855713, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 457.0, "completions/mean_terminated_length": 457.0, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.31507102010699134, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.06834477419033647, "learning_rate": 1.73192444344849e-05, "loss": 0.0027, "num_tokens": 13391927.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 324.625, "completions/mean_terminated_length": 324.625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.3152554879173584, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.08674825076013803, "learning_rate": 1.7314854567891213e-05, "loss": 0.0035, "num_tokens": 13399396.0, "reward": 1.40625, "reward_std": 0.18600594997406006, "rewards/fixed_code_pass_all_test_reward/mean": 0.40625, "rewards/fixed_code_pass_all_test_reward/std": 0.18600596487522125, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.3154399557277255, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.05902333464473486, "learning_rate": 1.7310461667253816e-05, "loss": 0.0024, "num_tokens": 13407357.0, "reward": 1.8901515007019043, "reward_std": 0.3106984496116638, "rewards/fixed_code_pass_all_test_reward/mean": 0.8901515007019043, "rewards/fixed_code_pass_all_test_reward/std": 0.3106984496116638, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.3156244235380926, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.07108163274824619, "learning_rate": 1.7306065734394778e-05, "loss": 0.0028, "num_tokens": 13415421.0, "reward": 1.8344594240188599, "reward_std": 0.2035055160522461, "rewards/fixed_code_pass_all_test_reward/mean": 0.8344594240188599, "rewards/fixed_code_pass_all_test_reward/std": 0.2035055011510849, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.3158088913484597, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.05727237625978887, "learning_rate": 1.7301666771137438e-05, "loss": 0.0023, "num_tokens": 13424839.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 159.125, "completions/mean_terminated_length": 159.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.31599335915882676, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.08706144010648131, "learning_rate": 1.729726477930639e-05, "loss": 0.0035, "num_tokens": 13428984.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.3161778269691939, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.05454727518372238, "learning_rate": 1.7292859760727493e-05, "loss": 0.0022, "num_tokens": 13436726.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 283.75, "completions/mean_terminated_length": 283.75, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.31636229477956096, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.07069510337896645, "learning_rate": 1.7288451717227847e-05, "loss": 0.0028, "num_tokens": 13447236.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 150.75, "completions/mean_terminated_length": 150.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.31654676258992803, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.04003100364934653, "learning_rate": 1.728404065063581e-05, "loss": 0.0016, "num_tokens": 13454634.0, "reward": 1.2916667461395264, "reward_std": 0.41547447443008423, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.31673123040029516, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.07106684963218868, "learning_rate": 1.7279626562781007e-05, "loss": 0.0028, "num_tokens": 13460597.0, "reward": 1.6875, "reward_std": 0.6911429166793823, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 367.625, "completions/mean_terminated_length": 367.625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.31691569821066223, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.07794953975826502, "learning_rate": 1.7275209455494297e-05, "loss": 0.0031, "num_tokens": 13468642.0, "reward": 0.71875, "reward_std": 0.60411536693573, "rewards/fixed_code_pass_all_test_reward/mean": 0.09375, "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.3171001660210293, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.10832982417196035, "learning_rate": 1.727078933060781e-05, "loss": 0.0043, "num_tokens": 13472744.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 699.375, "completions/mean_terminated_length": 699.375, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.31728463383139643, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.037384640192613006, "learning_rate": 1.7266366189954906e-05, "loss": 0.0015, "num_tokens": 13489723.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.3174691016417635, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.08884513657540083, "learning_rate": 1.726194003537022e-05, "loss": 0.0036, "num_tokens": 13496408.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.3176535694521306, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.11011776700615883, "learning_rate": 1.725751086868962e-05, "loss": 0.0044, "num_tokens": 13506225.0, "reward": 1.0535714626312256, "reward_std": 0.1515229046344757, "rewards/fixed_code_pass_all_test_reward/mean": 0.0535714291036129, "rewards/fixed_code_pass_all_test_reward/std": 0.15152288973331451, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3178380372624977, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.031162158818915486, "learning_rate": 1.7253078691750228e-05, "loss": 0.0012, "num_tokens": 13513073.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 449.5, "completions/mean_terminated_length": 449.5, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.3180225050728648, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.07304685795679688, "learning_rate": 1.7248643506390418e-05, "loss": 0.0029, "num_tokens": 13522565.0, "reward": 1.0, "reward_std": 1.0690449476242065, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.31820697288323185, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.08278318261727691, "learning_rate": 1.7244205314449808e-05, "loss": 0.0033, "num_tokens": 13530889.0, "reward": 1.8640351295471191, "reward_std": 0.01933232881128788, "rewards/fixed_code_pass_all_test_reward/mean": 0.8640350699424744, "rewards/fixed_code_pass_all_test_reward/std": 0.019332395866513252, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 149.25, "completions/mean_terminated_length": 149.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.318391440693599, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.06994786602444947, "learning_rate": 1.723976411776926e-05, "loss": 0.0028, "num_tokens": 13537475.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 248.875, "completions/mean_terminated_length": 248.875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.31857590850396605, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.10048739518970251, "learning_rate": 1.7235319918190897e-05, "loss": 0.004, "num_tokens": 13548874.0, "reward": 1.7708332538604736, "reward_std": 0.17472408711910248, "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.17472407221794128, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 217.0, "completions/mean_terminated_length": 217.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.3187603763143331, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.09347222209908068, "learning_rate": 1.7230872717558064e-05, "loss": 0.0037, "num_tokens": 13554906.0, "reward": 1.6160714626312256, "reward_std": 0.31929564476013184, "rewards/fixed_code_pass_all_test_reward/mean": 0.6160714030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.31929564476013184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.31894484412470026, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.0574939283542335, "learning_rate": 1.7226422517715372e-05, "loss": 0.0023, "num_tokens": 13563709.0, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 515.75, "completions/mean_terminated_length": 515.75, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.31912931193506733, "frac_reward_zero_std": 1.0, "grad_norm": 0.052734375, "kl": 0.02540092304116115, "learning_rate": 1.7221969320508665e-05, "loss": 0.001, "num_tokens": 13580259.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 343.5, "completions/mean_terminated_length": 343.5, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.3193137797454344, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.042456219205632806, "learning_rate": 1.721751312778504e-05, "loss": 0.0017, "num_tokens": 13588511.0, "reward": 1.84375, "reward_std": 0.29693374037742615, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.29693374037742615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.31949824755580153, "frac_reward_zero_std": 1.0, "grad_norm": 0.427734375, "kl": 0.11974174994975328, "learning_rate": 1.721305394139282e-05, "loss": 0.0048, "num_tokens": 13596965.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 110.75, "completions/mean_terminated_length": 110.75, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.3196827153661686, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.12396978307515383, "learning_rate": 1.720859176318158e-05, "loss": 0.005, "num_tokens": 13600675.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 652.125, "completions/mean_terminated_length": 652.125, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.3198671831765357, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.05711705144494772, "learning_rate": 1.7204126595002137e-05, "loss": 0.0023, "num_tokens": 13619188.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 103.5, "completions/mean_terminated_length": 103.5, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.3200516509869028, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.06888089678250253, "learning_rate": 1.7199658438706552e-05, "loss": 0.0028, "num_tokens": 13625064.0, "reward": 1.4166667461395264, "reward_std": 0.49601584672927856, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.49601590633392334, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.3202361187972699, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.03660445171408355, "learning_rate": 1.719518729614811e-05, "loss": 0.0015, "num_tokens": 13629780.0, "reward": 1.9375, "reward_std": 0.08625822514295578, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.08625820279121399, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 261.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.32042058660763695, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.08010617177933455, "learning_rate": 1.7190713169181353e-05, "loss": 0.0032, "num_tokens": 13640547.0, "reward": 1.754807710647583, "reward_std": 0.3756861686706543, "rewards/fixed_code_pass_all_test_reward/mean": 0.754807710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.3756862282752991, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 304.875, "completions/mean_terminated_length": 304.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.3206050544180041, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.027594240091275424, "learning_rate": 1.718623605966205e-05, "loss": 0.0011, "num_tokens": 13647338.0, "reward": 1.3035714626312256, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.4285714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.4040610194206238, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 311.0, "completions/mean_terminated_length": 311.0, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.32078952222837115, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.03876264626160264, "learning_rate": 1.7181755969447208e-05, "loss": 0.0016, "num_tokens": 13655010.0, "reward": 1.3125, "reward_std": 0.7039429545402527, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 170.125, "completions/mean_terminated_length": 170.125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.3209739900387382, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.10156259359791875, "learning_rate": 1.7177272900395067e-05, "loss": 0.0041, "num_tokens": 13660187.0, "reward": 1.3624999523162842, "reward_std": 0.37940531969070435, "rewards/fixed_code_pass_all_test_reward/mean": 0.36250001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.37940534949302673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.32115845784910535, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.07604627264663577, "learning_rate": 1.7172786854365116e-05, "loss": 0.003, "num_tokens": 13667740.0, "reward": 1.7999999523162842, "reward_std": 0.3505098223686218, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.3505098521709442, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 162.5, "completions/mean_terminated_length": 162.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.3213429256594724, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.06778676714748144, "learning_rate": 1.7168297833218065e-05, "loss": 0.0027, "num_tokens": 13672376.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 177.375, "completions/mean_terminated_length": 177.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.3215273934698395, "frac_reward_zero_std": 1.0, "grad_norm": 0.1669921875, "kl": 0.07130752224475145, "learning_rate": 1.7163805838815864e-05, "loss": 0.0029, "num_tokens": 13677331.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 182.75, "completions/mean_terminated_length": 182.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.3217118612802066, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.042329208459705114, "learning_rate": 1.7159310873021694e-05, "loss": 0.0017, "num_tokens": 13682057.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.3218963290905737, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.06558656017296016, "learning_rate": 1.715481293769997e-05, "loss": 0.0026, "num_tokens": 13687430.0, "reward": 1.2916667461395264, "reward_std": 0.4520675837993622, "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.4520675837993622, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 76.5, "completions/mean_terminated_length": 76.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.32208079690094077, "frac_reward_zero_std": 1.0, "grad_norm": 0.1669921875, "kl": 0.1096366704441607, "learning_rate": 1.7150312034716344e-05, "loss": 0.0044, "num_tokens": 13690914.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.3222652647113079, "frac_reward_zero_std": 0.0, "grad_norm": 3.984375, "kl": 0.20319179259240627, "learning_rate": 1.7145808165937684e-05, "loss": 0.0081, "num_tokens": 13699493.0, "reward": 1.220339059829712, "reward_std": 0.6655645966529846, "rewards/fixed_code_pass_all_test_reward/mean": 0.47033900022506714, "rewards/fixed_code_pass_all_test_reward/std": 0.366414338350296, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 119.875, "completions/mean_terminated_length": 119.875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.32244973252167497, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.11439052456989884, "learning_rate": 1.7141301333232107e-05, "loss": 0.0046, "num_tokens": 13703404.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 86.125, "completions/mean_terminated_length": 86.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.32263420033204204, "frac_reward_zero_std": 1.0, "grad_norm": 0.58203125, "kl": 0.12312283227220178, "learning_rate": 1.7136791538468938e-05, "loss": 0.0049, "num_tokens": 13706965.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 260.375, "completions/mean_terminated_length": 260.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.32281866814240917, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.05349839059635997, "learning_rate": 1.7132278783518756e-05, "loss": 0.0021, "num_tokens": 13713760.0, "reward": 1.52173912525177, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.52173912525177, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.32300313595277624, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.06634359713643789, "learning_rate": 1.7127763070253347e-05, "loss": 0.0027, "num_tokens": 13719886.0, "reward": 1.6770832538604736, "reward_std": 0.35477888584136963, "rewards/fixed_code_pass_all_test_reward/mean": 0.6770833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.35477888584136963, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.3231876037631433, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.08159099030308425, "learning_rate": 1.7123244400545732e-05, "loss": 0.0033, "num_tokens": 13729463.0, "reward": 1.69921875, "reward_std": 0.4537595510482788, "rewards/fixed_code_pass_all_test_reward/mean": 0.82421875, "rewards/fixed_code_pass_all_test_reward/std": 0.3621062934398651, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.32337207157351044, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.05444858060218394, "learning_rate": 1.7118722776270157e-05, "loss": 0.0022, "num_tokens": 13741863.0, "reward": 1.0474138259887695, "reward_std": 0.036574505269527435, "rewards/fixed_code_pass_all_test_reward/mean": 0.047413796186447144, "rewards/fixed_code_pass_all_test_reward/std": 0.03657448664307594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.3235565393838775, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.061421151738613844, "learning_rate": 1.7114198199302093e-05, "loss": 0.0025, "num_tokens": 13748587.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.3237410071942446, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.07919874461367726, "learning_rate": 1.7109670671518245e-05, "loss": 0.0032, "num_tokens": 13755617.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.3239254750046117, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.09049553005024791, "learning_rate": 1.7105140194796523e-05, "loss": 0.0036, "num_tokens": 13761606.0, "reward": 1.453125, "reward_std": 0.22097086906433105, "rewards/fixed_code_pass_all_test_reward/mean": 0.453125, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 220.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.3241099428149788, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.06494356342591345, "learning_rate": 1.7100606771016077e-05, "loss": 0.0026, "num_tokens": 13772476.0, "reward": 1.764423131942749, "reward_std": 0.2910597324371338, "rewards/fixed_code_pass_all_test_reward/mean": 0.7644230723381042, "rewards/fixed_code_pass_all_test_reward/std": 0.2910597324371338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 320.0, "completions/mean_terminated_length": 320.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.32429441062534586, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.064097436144948, "learning_rate": 1.7096070402057272e-05, "loss": 0.0026, "num_tokens": 13780172.0, "reward": 1.3461538553237915, "reward_std": 0.6014915704727173, "rewards/fixed_code_pass_all_test_reward/mean": 0.4711538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.3196508586406708, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 97.625, "completions/mean_terminated_length": 97.625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.324478878435713, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.0723551856353879, "learning_rate": 1.7091531089801693e-05, "loss": 0.0029, "num_tokens": 13784017.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 239.875, "completions/mean_terminated_length": 239.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.32466334624608006, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.058560516219586134, "learning_rate": 1.7086988836132153e-05, "loss": 0.0023, "num_tokens": 13790304.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.32484781405644714, "frac_reward_zero_std": 1.0, "grad_norm": 0.82421875, "kl": 0.11947760917246342, "learning_rate": 1.7082443642932683e-05, "loss": 0.0048, "num_tokens": 13796748.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.32503228186681427, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.09552226401865482, "learning_rate": 1.707789551208852e-05, "loss": 0.0038, "num_tokens": 13800580.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.32521674967718134, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.09882478695362806, "learning_rate": 1.7073344445486145e-05, "loss": 0.004, "num_tokens": 13806734.0, "reward": 1.6634615659713745, "reward_std": 0.3082070052623749, "rewards/fixed_code_pass_all_test_reward/mean": 0.6634615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.3082070052623749, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 420.625, "completions/mean_terminated_length": 420.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.3254012174875484, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.06548473844304681, "learning_rate": 1.706879044501323e-05, "loss": 0.0026, "num_tokens": 13817363.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.32558568529791554, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0468357396312058, "learning_rate": 1.7064233512558683e-05, "loss": 0.0019, "num_tokens": 13827655.0, "reward": 1.4736841917037964, "reward_std": 0.371096670627594, "rewards/fixed_code_pass_all_test_reward/mean": 0.4736841917037964, "rewards/fixed_code_pass_all_test_reward/std": 0.3710966110229492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.3257701531082826, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.049599898513406515, "learning_rate": 1.705967365001262e-05, "loss": 0.002, "num_tokens": 13837962.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3259546209186497, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.04760073567740619, "learning_rate": 1.705511085926637e-05, "loss": 0.0019, "num_tokens": 13844987.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 265.625, "completions/mean_terminated_length": 265.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.3261390887290168, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.050524459686130285, "learning_rate": 1.7050545142212483e-05, "loss": 0.002, "num_tokens": 13857168.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 298.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.3263235565393839, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.07036495907232165, "learning_rate": 1.704597650074472e-05, "loss": 0.0028, "num_tokens": 13864182.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 338.0, "completions/mean_terminated_length": 338.0, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.32650802434975096, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.07585422298870981, "learning_rate": 1.7041404936758053e-05, "loss": 0.003, "num_tokens": 13875286.0, "reward": 1.0760869979858398, "reward_std": 0.030743766576051712, "rewards/fixed_code_pass_all_test_reward/mean": 0.07608695328235626, "rewards/fixed_code_pass_all_test_reward/std": 0.030743775889277458, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 243.25, "completions/mean_terminated_length": 243.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.3266924921601181, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.09315267764031887, "learning_rate": 1.7036830452148672e-05, "loss": 0.0037, "num_tokens": 13884080.0, "reward": 1.2991070747375488, "reward_std": 0.3328129053115845, "rewards/fixed_code_pass_all_test_reward/mean": 0.2991071343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.3328128457069397, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.32687695997048516, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.05039560096338391, "learning_rate": 1.7032253048813968e-05, "loss": 0.002, "num_tokens": 13888465.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 140.625, "completions/mean_terminated_length": 140.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.32706142778085223, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.06407754495739937, "learning_rate": 1.7027672728652554e-05, "loss": 0.0026, "num_tokens": 13892406.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 196.125, "completions/mean_terminated_length": 196.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.3272458955912193, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.05676624272018671, "learning_rate": 1.7023089493564248e-05, "loss": 0.0023, "num_tokens": 13897159.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 408.5, "completions/mean_terminated_length": 408.5, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.32743036340158643, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.06385024392511696, "learning_rate": 1.7018503345450076e-05, "loss": 0.0026, "num_tokens": 13906363.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.3276148312119535, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.06372689630370587, "learning_rate": 1.7013914286212272e-05, "loss": 0.0025, "num_tokens": 13914689.0, "reward": 1.859375, "reward_std": 0.34785163402557373, "rewards/fixed_code_pass_all_test_reward/mean": 0.859375, "rewards/fixed_code_pass_all_test_reward/std": 0.34785160422325134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.3277992990223206, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.05131908506155014, "learning_rate": 1.700932231775428e-05, "loss": 0.0021, "num_tokens": 13921452.0, "reward": 0.9419642686843872, "reward_std": 0.4242715835571289, "rewards/fixed_code_pass_all_test_reward/mean": 0.0669642835855484, "rewards/fixed_code_pass_all_test_reward/std": 0.18940360844135284, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 370.5, "completions/mean_terminated_length": 370.5, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.3279837668326877, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.06482505751773715, "learning_rate": 1.700472744198075e-05, "loss": 0.0026, "num_tokens": 13932176.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 316.625, "completions/mean_terminated_length": 316.625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.3281682346430548, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.038054858800023794, "learning_rate": 1.7000129660797534e-05, "loss": 0.0015, "num_tokens": 13939637.0, "reward": 0.886363685131073, "reward_std": 0.5470755696296692, "rewards/fixed_code_pass_all_test_reward/mean": 0.13636364042758942, "rewards/fixed_code_pass_all_test_reward/std": 0.08416546881198883, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.32835270245342185, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.06497360905632377, "learning_rate": 1.6995528976111695e-05, "loss": 0.0026, "num_tokens": 13944387.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 435.875, "completions/mean_terminated_length": 435.875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.328537170263789, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.04146002361085266, "learning_rate": 1.6990925389831497e-05, "loss": 0.0017, "num_tokens": 13953226.0, "reward": 1.59375, "reward_std": 0.376485139131546, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.3764851689338684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.32872163807415605, "frac_reward_zero_std": 1.0, "grad_norm": 0.1982421875, "kl": 0.06650712410919368, "learning_rate": 1.6986318903866408e-05, "loss": 0.0027, "num_tokens": 13960370.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 113.875, "completions/mean_terminated_length": 113.875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.3289061058845231, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.08997930819168687, "learning_rate": 1.6981709520127102e-05, "loss": 0.0036, "num_tokens": 13966393.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 334.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.32909057369489025, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.06005093525163829, "learning_rate": 1.6977097240525443e-05, "loss": 0.0024, "num_tokens": 13973469.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 352.375, "completions/mean_terminated_length": 352.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.3292750415052573, "frac_reward_zero_std": 0.0, "grad_norm": 3.75, "kl": 0.1745684975758195, "learning_rate": 1.6972482066974517e-05, "loss": 0.007, "num_tokens": 13983952.0, "reward": 1.4066901206970215, "reward_std": 0.5714432001113892, "rewards/fixed_code_pass_all_test_reward/mean": 0.5316901206970215, "rewards/fixed_code_pass_all_test_reward/std": 0.22279103100299835, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 344.0, "completions/mean_terminated_length": 344.0, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.3294595093156244, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.07473253342323005, "learning_rate": 1.696786400138859e-05, "loss": 0.003, "num_tokens": 13994208.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.3296439771259915, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.07195319165475667, "learning_rate": 1.696324304568314e-05, "loss": 0.0029, "num_tokens": 14000058.0, "reward": 1.0725805759429932, "reward_std": 0.030277149751782417, "rewards/fixed_code_pass_all_test_reward/mean": 0.07258065044879913, "rewards/fixed_code_pass_all_test_reward/std": 0.03027711808681488, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 321.5, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.3298284449363586, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.04790842649526894, "learning_rate": 1.6958619201774835e-05, "loss": 0.0019, "num_tokens": 14007326.0, "reward": 1.83152174949646, "reward_std": 0.34504908323287964, "rewards/fixed_code_pass_all_test_reward/mean": 0.83152174949646, "rewards/fixed_code_pass_all_test_reward/std": 0.34504908323287964, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 1042.625, "completions/mean_terminated_length": 707.5, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 0.3300129127467257, "frac_reward_zero_std": 0.0, "grad_norm": 0.71875, "kl": 0.021890265576075763, "learning_rate": 1.695399247158155e-05, "loss": 0.0009, "num_tokens": 14023555.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 299.875, "completions/mean_terminated_length": 299.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.3301973805570928, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.09823067160323262, "learning_rate": 1.6949362857022352e-05, "loss": 0.0039, "num_tokens": 14032282.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 556.125, "completions/mean_terminated_length": 556.125, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.3303818483674599, "frac_reward_zero_std": 1.0, "grad_norm": 0.03369140625, "kl": 0.0172413062537089, "learning_rate": 1.6944730360017506e-05, "loss": 0.0007, "num_tokens": 14044987.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.33056631617782695, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.08641867013648152, "learning_rate": 1.6940094982488465e-05, "loss": 0.0035, "num_tokens": 14049311.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.3307507839881941, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.07004620973020792, "learning_rate": 1.693545672635789e-05, "loss": 0.0028, "num_tokens": 14058530.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 412.5, "completions/mean_terminated_length": 412.5, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.33093525179856115, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.04709272366017103, "learning_rate": 1.6930815593549632e-05, "loss": 0.0019, "num_tokens": 14068726.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3311197196089282, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.06261751311831176, "learning_rate": 1.6926171585988728e-05, "loss": 0.0025, "num_tokens": 14075667.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 294.25, "completions/mean_terminated_length": 294.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.33130418741929535, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.04316610051319003, "learning_rate": 1.692152470560141e-05, "loss": 0.0017, "num_tokens": 14084381.0, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.3314886552296624, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.12030347739346325, "learning_rate": 1.6916874954315107e-05, "loss": 0.0048, "num_tokens": 14089757.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 433.875, "completions/mean_terminated_length": 433.875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.3316731230400295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.030831653391942382, "learning_rate": 1.6912222334058438e-05, "loss": 0.0012, "num_tokens": 14098876.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.3318575908503966, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.030932712834328413, "learning_rate": 1.6907566846761203e-05, "loss": 0.0012, "num_tokens": 14108414.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 334.0, "completions/mean_terminated_length": 334.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.3320420586607637, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.0728874730411917, "learning_rate": 1.69029084943544e-05, "loss": 0.0029, "num_tokens": 14120766.0, "reward": 1.1964285373687744, "reward_std": 0.2665691673755646, "rewards/fixed_code_pass_all_test_reward/mean": 0.196428582072258, "rewards/fixed_code_pass_all_test_reward/std": 0.2665691375732422, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 450.875, "completions/mean_terminated_length": 450.875, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.33222652647113077, "frac_reward_zero_std": 1.0, "grad_norm": 0.052001953125, "kl": 0.039789131842553616, "learning_rate": 1.6898247278770217e-05, "loss": 0.0016, "num_tokens": 14130157.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 213.0, "completions/mean_terminated_length": 213.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.3324109942814979, "frac_reward_zero_std": 1.0, "grad_norm": 0.04248046875, "kl": 0.022947333403863013, "learning_rate": 1.6893583201942023e-05, "loss": 0.0009, "num_tokens": 14138413.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.33259546209186497, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.0793593842536211, "learning_rate": 1.6888916265804374e-05, "loss": 0.0032, "num_tokens": 14147908.0, "reward": 1.4932432174682617, "reward_std": 0.2689155042171478, "rewards/fixed_code_pass_all_test_reward/mean": 0.4932432472705841, "rewards/fixed_code_pass_all_test_reward/std": 0.2689155340194702, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 215.25, "completions/mean_terminated_length": 215.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.33277992990223204, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.06717079039663076, "learning_rate": 1.6884246472293018e-05, "loss": 0.0027, "num_tokens": 14152430.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 290.75, "completions/mean_terminated_length": 290.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.33296439771259917, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.07996788760647178, "learning_rate": 1.6879573823344878e-05, "loss": 0.0032, "num_tokens": 14158524.0, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.33314886552296624, "frac_reward_zero_std": 1.0, "grad_norm": 0.326171875, "kl": 0.1264201975427568, "learning_rate": 1.6874898320898078e-05, "loss": 0.0051, "num_tokens": 14162483.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.3333333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.07288231747224927, "learning_rate": 1.687021996689191e-05, "loss": 0.0029, "num_tokens": 14167653.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 326.875, "completions/mean_terminated_length": 326.875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.33351780114370044, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.047797456150874496, "learning_rate": 1.686553876326685e-05, "loss": 0.0019, "num_tokens": 14177284.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.3337022689540675, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251953125, "kl": 0.04237889649812132, "learning_rate": 1.6860854711964566e-05, "loss": 0.0017, "num_tokens": 14182993.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 287.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.3338867367644346, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.06251822551712394, "learning_rate": 1.68561678149279e-05, "loss": 0.0025, "num_tokens": 14191709.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 315.75, "completions/mean_terminated_length": 315.75, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.3340712045748017, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.052541672717779875, "learning_rate": 1.6851478074100878e-05, "loss": 0.0021, "num_tokens": 14198699.0, "reward": 1.8852040767669678, "reward_std": 0.12718607485294342, "rewards/fixed_code_pass_all_test_reward/mean": 0.8852040767669678, "rewards/fixed_code_pass_all_test_reward/std": 0.12718605995178223, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.3342556723851688, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.08098199497908354, "learning_rate": 1.6846785491428697e-05, "loss": 0.0032, "num_tokens": 14207353.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 407.375, "completions/mean_terminated_length": 407.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.33444014019553586, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.06894407281652093, "learning_rate": 1.6842090068857743e-05, "loss": 0.0028, "num_tokens": 14223052.0, "reward": 1.875, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 301.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.334624608005903, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.0704963922034949, "learning_rate": 1.6837391808335577e-05, "loss": 0.0028, "num_tokens": 14232744.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 349.0, "completions/mean_terminated_length": 349.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.33480907581627006, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.08252359693869948, "learning_rate": 1.6832690711810935e-05, "loss": 0.0033, "num_tokens": 14239184.0, "reward": 1.725000023841858, "reward_std": 0.7005100250244141, "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.3505098223686218, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.33499354362663714, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.10387273645028472, "learning_rate": 1.682798678123373e-05, "loss": 0.0042, "num_tokens": 14247841.0, "reward": 0.9950000047683716, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.11999999731779099, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 111.0, "completions/mean_terminated_length": 111.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.33517801143700426, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.053496206644922495, "learning_rate": 1.6823280018555047e-05, "loss": 0.0021, "num_tokens": 14251593.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 539.875, "completions/mean_terminated_length": 539.875, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.33536247924737134, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.03457971604075283, "learning_rate": 1.6818570425727158e-05, "loss": 0.0014, "num_tokens": 14262264.0, "reward": 1.3125, "reward_std": 0.12626908719539642, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.12626907229423523, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 322.875, "completions/mean_terminated_length": 322.875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.3355469470577384, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.0786975771188736, "learning_rate": 1.6813858004703493e-05, "loss": 0.0031, "num_tokens": 14271567.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.33573141486810554, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.07118736952543259, "learning_rate": 1.6809142757438667e-05, "loss": 0.0028, "num_tokens": 14276000.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 405.5, "completions/mean_terminated_length": 405.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.3359158826784726, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.045538471546024084, "learning_rate": 1.680442468588846e-05, "loss": 0.0018, "num_tokens": 14284372.0, "reward": 1.6964285373687744, "reward_std": 0.3452087938785553, "rewards/fixed_code_pass_all_test_reward/mean": 0.6964285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.3452087640762329, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3361003504888397, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0654539312236011, "learning_rate": 1.679970379200983e-05, "loss": 0.0026, "num_tokens": 14295513.0, "reward": 1.5144230127334595, "reward_std": 0.5253254175186157, "rewards/fixed_code_pass_all_test_reward/mean": 0.6394230723381042, "rewards/fixed_code_pass_all_test_reward/std": 0.29858604073524475, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.3362848182992068, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.07510208757594228, "learning_rate": 1.6794980077760894e-05, "loss": 0.003, "num_tokens": 14299830.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.3364692861095739, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.07100888155400753, "learning_rate": 1.679025354510095e-05, "loss": 0.0028, "num_tokens": 14304289.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 330.125, "completions/mean_terminated_length": 330.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.33665375391994096, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0905956863425672, "learning_rate": 1.6785524195990467e-05, "loss": 0.0036, "num_tokens": 14313610.0, "reward": 1.4375, "reward_std": 0.7288689613342285, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 315.75, "completions/mean_terminated_length": 315.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.3368382217303081, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.04272888752166182, "learning_rate": 1.6780792032391074e-05, "loss": 0.0017, "num_tokens": 14322288.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 281.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.33702268954067516, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.06999608455225825, "learning_rate": 1.6776057056265565e-05, "loss": 0.0028, "num_tokens": 14328195.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.33720715735104223, "frac_reward_zero_std": 0.0, "grad_norm": 3.578125, "kl": 0.13367241295054555, "learning_rate": 1.6771319269577917e-05, "loss": 0.0053, "num_tokens": 14332134.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 449.5, "completions/mean_terminated_length": 449.5, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.33739162516140936, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.060629066079854965, "learning_rate": 1.676657867429325e-05, "loss": 0.0024, "num_tokens": 14340770.0, "reward": 0.8625000715255737, "reward_std": 0.44380658864974976, "rewards/fixed_code_pass_all_test_reward/mean": 0.11250000447034836, "rewards/fixed_code_pass_all_test_reward/std": 0.06408699601888657, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 144.5, "completions/mean_terminated_length": 144.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.33757609297177643, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.07964523741975427, "learning_rate": 1.6761835272377866e-05, "loss": 0.0032, "num_tokens": 14344846.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 451.25, "completions/mean_terminated_length": 451.25, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.3377605607821435, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.06898640841245651, "learning_rate": 1.6757089065799226e-05, "loss": 0.0028, "num_tokens": 14353392.0, "reward": 1.9464285373687744, "reward_std": 0.1062890887260437, "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.10628911107778549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.33794502859251063, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.04422305093612522, "learning_rate": 1.6752340056525953e-05, "loss": 0.0018, "num_tokens": 14362389.0, "reward": 1.1749999523162842, "reward_std": 0.6453127861022949, "rewards/fixed_code_pass_all_test_reward/mean": 0.42500001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.2712405323982239, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 240.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.3381294964028777, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.09641394019126892, "learning_rate": 1.6747588246527834e-05, "loss": 0.0039, "num_tokens": 14371313.0, "reward": 1.2236841917037964, "reward_std": 0.17512504756450653, "rewards/fixed_code_pass_all_test_reward/mean": 0.22368420660495758, "rewards/fixed_code_pass_all_test_reward/std": 0.17512504756450653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 366.75, "completions/mean_terminated_length": 366.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.3383139642132448, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.05195535160601139, "learning_rate": 1.6742833637775814e-05, "loss": 0.0021, "num_tokens": 14378311.0, "reward": 1.25, "reward_std": 0.10101527720689774, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 263.25, "completions/mean_terminated_length": 263.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.3384984320236119, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.08346772566437721, "learning_rate": 1.6738076232242007e-05, "loss": 0.0033, "num_tokens": 14389025.0, "reward": 1.0340908765792847, "reward_std": 0.09642363339662552, "rewards/fixed_code_pass_all_test_reward/mean": 0.034090910106897354, "rewards/fixed_code_pass_all_test_reward/std": 0.0964236631989479, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.338682899833979, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.09666198492050171, "learning_rate": 1.673331603189968e-05, "loss": 0.0039, "num_tokens": 14398183.0, "reward": 1.9272727966308594, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9272727370262146, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 335.625, "completions/mean_terminated_length": 335.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.33886736764434605, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.03360424540005624, "learning_rate": 1.672855303872326e-05, "loss": 0.0013, "num_tokens": 14404308.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 466.75, "completions/mean_terminated_length": 466.75, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.3390518354547132, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.05345094855874777, "learning_rate": 1.672378725468834e-05, "loss": 0.0021, "num_tokens": 14417426.0, "reward": 1.326923131942749, "reward_std": 0.43659716844558716, "rewards/fixed_code_pass_all_test_reward/mean": 0.32692307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.43659716844558716, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.33923630326508025, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.10190612683072686, "learning_rate": 1.6719018681771652e-05, "loss": 0.0041, "num_tokens": 14424710.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 302.75, "completions/mean_terminated_length": 302.75, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.3394207710754473, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.03227280848659575, "learning_rate": 1.6714247321951106e-05, "loss": 0.0013, "num_tokens": 14432044.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.3396052388858144, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.11663768393918872, "learning_rate": 1.670947317720576e-05, "loss": 0.0047, "num_tokens": 14437054.0, "reward": 1.7138159275054932, "reward_std": 0.16934514045715332, "rewards/fixed_code_pass_all_test_reward/mean": 0.7138158082962036, "rewards/fixed_code_pass_all_test_reward/std": 0.16934515535831451, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 562.125, "completions/mean_terminated_length": 562.125, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.3397897066961815, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.02897809841670096, "learning_rate": 1.670469624951582e-05, "loss": 0.0012, "num_tokens": 14448703.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3399741745065486, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.09394017769955099, "learning_rate": 1.6699916540862663e-05, "loss": 0.0038, "num_tokens": 14452867.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.34015864231691567, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.04340591165237129, "learning_rate": 1.6695134053228796e-05, "loss": 0.0017, "num_tokens": 14457228.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 308.375, "completions/mean_terminated_length": 308.375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.3403431101272828, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.03579376148991287, "learning_rate": 1.6690348788597896e-05, "loss": 0.0014, "num_tokens": 14464799.0, "reward": 1.578125, "reward_std": 0.2603869140148163, "rewards/fixed_code_pass_all_test_reward/mean": 0.578125, "rewards/fixed_code_pass_all_test_reward/std": 0.2603869140148163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.3405275779376499, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.0751229664310813, "learning_rate": 1.668556074895479e-05, "loss": 0.003, "num_tokens": 14472142.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.34071204574801695, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.08000096306204796, "learning_rate": 1.6680769936285456e-05, "loss": 0.0032, "num_tokens": 14477331.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 266.875, "completions/mean_terminated_length": 266.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.3408965135583841, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.05432684707920998, "learning_rate": 1.6675976352577016e-05, "loss": 0.0022, "num_tokens": 14484834.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.34108098136875115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.08251290349289775, "learning_rate": 1.667117999981774e-05, "loss": 0.0033, "num_tokens": 14493462.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.3412654491791182, "frac_reward_zero_std": 1.0, "grad_norm": 0.4453125, "kl": 0.11082548508420587, "learning_rate": 1.6666380879997063e-05, "loss": 0.0044, "num_tokens": 14497354.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 617.875, "completions/mean_terminated_length": 617.875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.34144991698948535, "frac_reward_zero_std": 1.0, "grad_norm": 0.033447265625, "kl": 0.024522725492715836, "learning_rate": 1.6661578995105545e-05, "loss": 0.001, "num_tokens": 14514145.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 193.375, "completions/mean_terminated_length": 193.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.3416343847998524, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.08688396867364645, "learning_rate": 1.6656774347134907e-05, "loss": 0.0035, "num_tokens": 14521476.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 279.5, "completions/mean_terminated_length": 279.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.3418188526102195, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.10130987642332911, "learning_rate": 1.6651966938078018e-05, "loss": 0.0041, "num_tokens": 14527648.0, "reward": 1.703125, "reward_std": 0.45285552740097046, "rewards/fixed_code_pass_all_test_reward/mean": 0.828125, "rewards/fixed_code_pass_all_test_reward/std": 0.35942351818084717, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.3420033204205866, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.14331673923879862, "learning_rate": 1.664715676992889e-05, "loss": 0.0057, "num_tokens": 14538911.0, "reward": 1.5892857313156128, "reward_std": 0.4561692476272583, "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.4561692476272583, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.3421877882309537, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.03531209426000714, "learning_rate": 1.6642343844682664e-05, "loss": 0.0014, "num_tokens": 14545976.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.34237225604132077, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.05379305640235543, "learning_rate": 1.6637528164335645e-05, "loss": 0.0022, "num_tokens": 14553110.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 245.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.3425567238516879, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.09437270276248455, "learning_rate": 1.6632709730885278e-05, "loss": 0.0038, "num_tokens": 14559196.0, "reward": 1.6224489212036133, "reward_std": 0.43879249691963196, "rewards/fixed_code_pass_all_test_reward/mean": 0.6224490404129028, "rewards/fixed_code_pass_all_test_reward/std": 0.43879246711730957, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.34274119166205497, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.06047657527960837, "learning_rate": 1.6627888546330136e-05, "loss": 0.0024, "num_tokens": 14565998.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.34292565947242204, "frac_reward_zero_std": 1.0, "grad_norm": 0.171875, "kl": 0.05493861041031778, "learning_rate": 1.662306461266995e-05, "loss": 0.0022, "num_tokens": 14575118.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.34311012728278917, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.13241078704595566, "learning_rate": 1.6618237931905578e-05, "loss": 0.0053, "num_tokens": 14579269.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.34329459509315624, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.08901078114286065, "learning_rate": 1.661340850603903e-05, "loss": 0.0036, "num_tokens": 14584903.0, "reward": 1.625, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 236.0, "completions/mean_terminated_length": 236.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.3434790629035233, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.12841561064124107, "learning_rate": 1.6608576337073437e-05, "loss": 0.0051, "num_tokens": 14589567.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 112.625, "completions/mean_terminated_length": 112.625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.34366353071389044, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.14813246298581362, "learning_rate": 1.6603741427013087e-05, "loss": 0.0059, "num_tokens": 14593252.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 305.25, "completions/mean_terminated_length": 305.25, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.3438479985242575, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.04706785595044494, "learning_rate": 1.6598903777863393e-05, "loss": 0.0019, "num_tokens": 14599654.0, "reward": 1.828125, "reward_std": 0.3199993073940277, "rewards/fixed_code_pass_all_test_reward/mean": 0.828125, "rewards/fixed_code_pass_all_test_reward/std": 0.3199993073940277, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 140.5, "completions/mean_terminated_length": 140.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3440324663346246, "frac_reward_zero_std": 1.0, "grad_norm": 0.232421875, "kl": 0.10468695918098092, "learning_rate": 1.6594063391630907e-05, "loss": 0.0042, "num_tokens": 14603690.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.3442169341449917, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.06596513045951724, "learning_rate": 1.6589220270323316e-05, "loss": 0.0026, "num_tokens": 14610319.0, "reward": 1.2441861629486084, "reward_std": 0.09866604208946228, "rewards/fixed_code_pass_all_test_reward/mean": 0.24418605864048004, "rewards/fixed_code_pass_all_test_reward/std": 0.09866606444120407, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 262.125, "completions/mean_terminated_length": 262.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.3444014019553588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.04148223507218063, "learning_rate": 1.6584374415949446e-05, "loss": 0.0017, "num_tokens": 14618624.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.34458586976572586, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.06318472512066364, "learning_rate": 1.6579525830519248e-05, "loss": 0.0025, "num_tokens": 14627564.0, "reward": 1.901898741722107, "reward_std": 0.18164823949337006, "rewards/fixed_code_pass_all_test_reward/mean": 0.9018987417221069, "rewards/fixed_code_pass_all_test_reward/std": 0.18164826929569244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 400.25, "completions/mean_terminated_length": 400.25, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.344770337576093, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.04568952368572354, "learning_rate": 1.6574674516043813e-05, "loss": 0.0018, "num_tokens": 14636590.0, "reward": 1.8799999952316284, "reward_std": 0.23518992960453033, "rewards/fixed_code_pass_all_test_reward/mean": 0.8799999952316284, "rewards/fixed_code_pass_all_test_reward/std": 0.23518989980220795, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.34495480538646006, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.07545109605416656, "learning_rate": 1.6569820474535362e-05, "loss": 0.003, "num_tokens": 14645099.0, "reward": 1.2624999284744263, "reward_std": 0.1060660108923912, "rewards/fixed_code_pass_all_test_reward/mean": 0.26250001788139343, "rewards/fixed_code_pass_all_test_reward/std": 0.1060660257935524, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 362.25, "completions/mean_terminated_length": 362.25, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.34513927319682713, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.028960632858797908, "learning_rate": 1.6564963708007248e-05, "loss": 0.0012, "num_tokens": 14653525.0, "reward": 1.7638888359069824, "reward_std": 0.4382001459598541, "rewards/fixed_code_pass_all_test_reward/mean": 0.7638888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.4382002055644989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.34532374100719426, "frac_reward_zero_std": 1.0, "grad_norm": 3.046875, "kl": 0.32261311635375023, "learning_rate": 1.6560104218473946e-05, "loss": 0.0129, "num_tokens": 14662282.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 250.125, "completions/mean_terminated_length": 250.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.34550820881756134, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.05735359154641628, "learning_rate": 1.6555242007951076e-05, "loss": 0.0023, "num_tokens": 14672523.0, "reward": 0.9833333492279053, "reward_std": 0.49984124302864075, "rewards/fixed_code_pass_all_test_reward/mean": 0.10833333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.306412935256958, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.3456926766279284, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.1050928570330143, "learning_rate": 1.6550377078455373e-05, "loss": 0.0042, "num_tokens": 14680821.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.34587714443829554, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.08572117099538445, "learning_rate": 1.6545509432004707e-05, "loss": 0.0034, "num_tokens": 14688243.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.3460616122486626, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.07859274535439909, "learning_rate": 1.654063907061807e-05, "loss": 0.0031, "num_tokens": 14695771.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 555.625, "completions/mean_terminated_length": 555.625, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.3462460800590297, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.04593815095722675, "learning_rate": 1.6535765996315585e-05, "loss": 0.0018, "num_tokens": 14710504.0, "reward": 1.811475396156311, "reward_std": 0.3730055093765259, "rewards/fixed_code_pass_all_test_reward/mean": 0.811475396156311, "rewards/fixed_code_pass_all_test_reward/std": 0.37300553917884827, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.3464305478693968, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.04887577937915921, "learning_rate": 1.6530890211118494e-05, "loss": 0.002, "num_tokens": 14717411.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 129.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3466150156797639, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.0533945532515645, "learning_rate": 1.6526011717049173e-05, "loss": 0.0021, "num_tokens": 14721314.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.34679948349013096, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.07130508404225111, "learning_rate": 1.6521130516131116e-05, "loss": 0.0029, "num_tokens": 14730815.0, "reward": 1.375, "reward_std": 0.6770032048225403, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.4364357888698578, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 334.625, "completions/mean_terminated_length": 334.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.3469839513004981, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.03924826974980533, "learning_rate": 1.6516246610388935e-05, "loss": 0.0016, "num_tokens": 14741324.0, "reward": 1.8035714626312256, "reward_std": 0.34940600395202637, "rewards/fixed_code_pass_all_test_reward/mean": 0.8035714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.34940600395202637, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 108.875, "completions/mean_terminated_length": 108.875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.34716841911086516, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.10823609447106719, "learning_rate": 1.6511360001848368e-05, "loss": 0.0043, "num_tokens": 14749339.0, "reward": 1.5446429252624512, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.5446428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 340.5, "completions/mean_terminated_length": 340.5, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.34735288692123223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.041260997066274285, "learning_rate": 1.6506470692536282e-05, "loss": 0.0017, "num_tokens": 14757367.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 332.25, "completions/mean_terminated_length": 332.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.34753735473159936, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.08893146039918065, "learning_rate": 1.650157868448065e-05, "loss": 0.0036, "num_tokens": 14770193.0, "reward": 1.0517241954803467, "reward_std": 0.08086924999952316, "rewards/fixed_code_pass_all_test_reward/mean": 0.051724135875701904, "rewards/fixed_code_pass_all_test_reward/std": 0.08086923509836197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 518.75, "completions/mean_terminated_length": 518.75, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.34772182254196643, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.043838605750352144, "learning_rate": 1.6496683979710576e-05, "loss": 0.0018, "num_tokens": 14781199.0, "reward": 1.0, "reward_std": 0.40824827551841736, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.07715167850255966, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 104.375, "completions/mean_terminated_length": 104.375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.3479062903523335, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.07627957640215755, "learning_rate": 1.6491786580256273e-05, "loss": 0.0031, "num_tokens": 14784778.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 213.0, "completions/mean_terminated_length": 213.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.34809075816270063, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.05368952197022736, "learning_rate": 1.648688648814908e-05, "loss": 0.0021, "num_tokens": 14792146.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.3482752259730677, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.07783088739961386, "learning_rate": 1.648198370542145e-05, "loss": 0.0031, "num_tokens": 14797302.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 328.25, "completions/mean_terminated_length": 328.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.3484596937834348, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0637719389051199, "learning_rate": 1.647707823410695e-05, "loss": 0.0026, "num_tokens": 14809200.0, "reward": 1.4940476417541504, "reward_std": 0.30477777123451233, "rewards/fixed_code_pass_all_test_reward/mean": 0.494047611951828, "rewards/fixed_code_pass_all_test_reward/std": 0.3047778308391571, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 408.625, "completions/mean_terminated_length": 408.625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.3486441615938019, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.058437996078282595, "learning_rate": 1.647217007624026e-05, "loss": 0.0023, "num_tokens": 14821197.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 148.625, "completions/mean_terminated_length": 148.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.348828629404169, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "kl": 0.07773642195388675, "learning_rate": 1.6467259233857186e-05, "loss": 0.0031, "num_tokens": 14825490.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.34901309721453605, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0726718872319907, "learning_rate": 1.646234570899463e-05, "loss": 0.0029, "num_tokens": 14832758.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 102.5, "completions/mean_terminated_length": 102.5, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.3491975650249032, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.12579491175711155, "learning_rate": 1.6457429503690624e-05, "loss": 0.005, "num_tokens": 14836298.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 216.875, "completions/mean_terminated_length": 216.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.34938203283527025, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.08530666213482618, "learning_rate": 1.64525106199843e-05, "loss": 0.0034, "num_tokens": 14845937.0, "reward": 1.5760869979858398, "reward_std": 0.23327097296714783, "rewards/fixed_code_pass_all_test_reward/mean": 0.5760869979858398, "rewards/fixed_code_pass_all_test_reward/std": 0.23327097296714783, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 235.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.3495665006456373, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.05650617368519306, "learning_rate": 1.6447589059915907e-05, "loss": 0.0023, "num_tokens": 14852154.0, "reward": 1.3988094329833984, "reward_std": 0.45314115285873413, "rewards/fixed_code_pass_all_test_reward/mean": 0.3988095223903656, "rewards/fixed_code_pass_all_test_reward/std": 0.45314112305641174, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.34975096845600445, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.04898400674574077, "learning_rate": 1.6442664825526797e-05, "loss": 0.002, "num_tokens": 14859226.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 263.375, "completions/mean_terminated_length": 263.375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.3499354362663715, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.11337560974061489, "learning_rate": 1.6437737918859445e-05, "loss": 0.0045, "num_tokens": 14867125.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.3501199040767386, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.050949950935319066, "learning_rate": 1.6432808341957414e-05, "loss": 0.002, "num_tokens": 14873096.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 443.5, "completions/mean_terminated_length": 443.5, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.3503043718871057, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.06160272937268019, "learning_rate": 1.6427876096865394e-05, "loss": 0.0025, "num_tokens": 14882500.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 109.5, "completions/mean_terminated_length": 109.5, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.3504888396974728, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "kl": 0.17130080610513687, "learning_rate": 1.642294118562917e-05, "loss": 0.0069, "num_tokens": 14890312.0, "reward": 1.8897058963775635, "reward_std": 0.31195884943008423, "rewards/fixed_code_pass_all_test_reward/mean": 0.8897058963775635, "rewards/fixed_code_pass_all_test_reward/std": 0.3119588792324066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.35067330750783987, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.06141763459891081, "learning_rate": 1.641800361029564e-05, "loss": 0.0025, "num_tokens": 14897815.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 113.125, "completions/mean_terminated_length": 113.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.350857775318207, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "kl": 0.0973310659173876, "learning_rate": 1.6413063372912807e-05, "loss": 0.0039, "num_tokens": 14901544.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.35104224312857407, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.0545880903955549, "learning_rate": 1.6408120475529763e-05, "loss": 0.0022, "num_tokens": 14908723.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.35122671093894114, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.08055225526914, "learning_rate": 1.6403174920196724e-05, "loss": 0.0032, "num_tokens": 14918309.0, "reward": 1.9272727966308594, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9272727370262146, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3514111787493083, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.08524278737604618, "learning_rate": 1.6398226708964994e-05, "loss": 0.0034, "num_tokens": 14926633.0, "reward": 1.9558823108673096, "reward_std": 0.12478352338075638, "rewards/fixed_code_pass_all_test_reward/mean": 0.9558823704719543, "rewards/fixed_code_pass_all_test_reward/std": 0.12478354573249817, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.35159564655967535, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.04821671545505524, "learning_rate": 1.639327584388699e-05, "loss": 0.0019, "num_tokens": 14931186.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 190.625, "completions/mean_terminated_length": 190.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.3517801143700424, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.07380560273304582, "learning_rate": 1.6388322327016223e-05, "loss": 0.003, "num_tokens": 14936719.0, "reward": 1.3125, "reward_std": 0.058925580233335495, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3519645821804095, "frac_reward_zero_std": 0.0, "grad_norm": 3.578125, "kl": 0.08559843432158232, "learning_rate": 1.6383366160407305e-05, "loss": 0.0034, "num_tokens": 14943885.0, "reward": 1.8897058963775635, "reward_std": 0.31195884943008423, "rewards/fixed_code_pass_all_test_reward/mean": 0.8897058963775635, "rewards/fixed_code_pass_all_test_reward/std": 0.3119588792324066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 563.875, "completions/mean_terminated_length": 563.875, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.3521490499907766, "frac_reward_zero_std": 0.0, "grad_norm": 0.90234375, "kl": 0.026124339317902923, "learning_rate": 1.637840734611594e-05, "loss": 0.001, "num_tokens": 14959012.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.3523335178011437, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.06862498680129647, "learning_rate": 1.6373445886198946e-05, "loss": 0.0027, "num_tokens": 14964710.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.35251798561151076, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.050481501035392284, "learning_rate": 1.636848178271422e-05, "loss": 0.002, "num_tokens": 14971274.0, "reward": 1.8922414779663086, "reward_std": 0.24963875114917755, "rewards/fixed_code_pass_all_test_reward/mean": 0.892241358757019, "rewards/fixed_code_pass_all_test_reward/std": 0.24963876605033875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 440.125, "completions/mean_terminated_length": 440.125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.3527024534218779, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.06756604835391045, "learning_rate": 1.6363515037720774e-05, "loss": 0.0027, "num_tokens": 14983723.0, "reward": 0.8839285373687744, "reward_std": 0.35803458094596863, "rewards/fixed_code_pass_all_test_reward/mean": 0.008928571827709675, "rewards/fixed_code_pass_all_test_reward/std": 0.025253813713788986, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.35288692123224497, "frac_reward_zero_std": 0.0, "grad_norm": 3.5625, "kl": 0.09804554283618927, "learning_rate": 1.6358545653278707e-05, "loss": 0.0039, "num_tokens": 14988620.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 547.25, "completions/mean_terminated_length": 547.25, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.35307138904261204, "frac_reward_zero_std": 0.0, "grad_norm": 0.62109375, "kl": 0.031994852353818715, "learning_rate": 1.6353573631449203e-05, "loss": 0.0013, "num_tokens": 15000510.0, "reward": 1.6307692527770996, "reward_std": 0.6827422380447388, "rewards/fixed_code_pass_all_test_reward/mean": 0.7557692527770996, "rewards/fixed_code_pass_all_test_reward/std": 0.3538402020931244, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 345.875, "completions/mean_terminated_length": 345.875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.35325585685297917, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.034319449216127396, "learning_rate": 1.6348598974294558e-05, "loss": 0.0014, "num_tokens": 15006997.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 250.0, "completions/mean_terminated_length": 250.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.35344032466334624, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.07073309551924467, "learning_rate": 1.6343621683878147e-05, "loss": 0.0028, "num_tokens": 15015381.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 276.375, "completions/mean_terminated_length": 276.375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.3536247924737133, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.0676805549301207, "learning_rate": 1.6338641762264443e-05, "loss": 0.0027, "num_tokens": 15023664.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.35380926028408044, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.04486285406164825, "learning_rate": 1.6333659211519015e-05, "loss": 0.0018, "num_tokens": 15029770.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.3539937280944475, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.07659539487212896, "learning_rate": 1.6328674033708517e-05, "loss": 0.0031, "num_tokens": 15039140.0, "reward": 1.6133720874786377, "reward_std": 0.15622125566005707, "rewards/fixed_code_pass_all_test_reward/mean": 0.6133720874786377, "rewards/fixed_code_pass_all_test_reward/std": 0.15622125566005707, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 399.875, "completions/mean_terminated_length": 399.875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.3541781959048146, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.05647521512582898, "learning_rate": 1.6323686230900686e-05, "loss": 0.0023, "num_tokens": 15052435.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3543626637151817, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.06266296003013849, "learning_rate": 1.631869580516436e-05, "loss": 0.0025, "num_tokens": 15058911.0, "reward": 1.4236111640930176, "reward_std": 0.26383668184280396, "rewards/fixed_code_pass_all_test_reward/mean": 0.4236111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.26383668184280396, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 257.875, "completions/mean_terminated_length": 257.875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.3545471315255488, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.07283595018088818, "learning_rate": 1.6313702758569458e-05, "loss": 0.0029, "num_tokens": 15069366.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 202.5, "completions/mean_terminated_length": 202.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.35473159933591586, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.13896444439888, "learning_rate": 1.630870709318699e-05, "loss": 0.0056, "num_tokens": 15077794.0, "reward": 1.5875000953674316, "reward_std": 0.1642080396413803, "rewards/fixed_code_pass_all_test_reward/mean": 0.5874999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.1642080545425415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 248.375, "completions/mean_terminated_length": 248.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.354916067146283, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.05703976028598845, "learning_rate": 1.6303708811089053e-05, "loss": 0.0023, "num_tokens": 15087173.0, "reward": 1.5919811725616455, "reward_std": 0.1653924435377121, "rewards/fixed_code_pass_all_test_reward/mean": 0.5919811129570007, "rewards/fixed_code_pass_all_test_reward/std": 0.16539248824119568, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 103.875, "completions/mean_terminated_length": 103.875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.35510053495665006, "frac_reward_zero_std": 1.0, "grad_norm": 1.40625, "kl": 0.18558701779693365, "learning_rate": 1.6298707914348817e-05, "loss": 0.0074, "num_tokens": 15090708.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.35528500276701713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.07324095675721765, "learning_rate": 1.6293704405040554e-05, "loss": 0.0029, "num_tokens": 15100736.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 644.875, "completions/mean_terminated_length": 644.875, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.35546947057738426, "frac_reward_zero_std": 0.0, "grad_norm": 0.734375, "kl": 0.040818021865561604, "learning_rate": 1.6288698285239605e-05, "loss": 0.0016, "num_tokens": 15111335.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.35565393838775133, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.05636401358060539, "learning_rate": 1.628368955702241e-05, "loss": 0.0023, "num_tokens": 15115080.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 320.875, "completions/mean_terminated_length": 320.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.3558384061981184, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.09004059759899974, "learning_rate": 1.6278678222466475e-05, "loss": 0.0036, "num_tokens": 15126727.0, "reward": 1.765625, "reward_std": 0.5686665773391724, "rewards/fixed_code_pass_all_test_reward/mean": 0.890625, "rewards/fixed_code_pass_all_test_reward/std": 0.2259652018547058, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 409.75, "completions/mean_terminated_length": 409.75, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.35602287400848553, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.07080508931539953, "learning_rate": 1.6273664283650393e-05, "loss": 0.0028, "num_tokens": 15138157.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 306.625, "completions/mean_terminated_length": 306.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.3562073418188526, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.06530932476744056, "learning_rate": 1.6268647742653837e-05, "loss": 0.0026, "num_tokens": 15145154.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 511.0, "completions/mean_terminated_length": 511.0, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.3563918096292197, "frac_reward_zero_std": 0.0, "grad_norm": 7.0, "kl": 0.6969900229014456, "learning_rate": 1.6263628601557562e-05, "loss": 0.0279, "num_tokens": 15153658.0, "reward": 1.5909091234207153, "reward_std": 0.45842501521110535, "rewards/fixed_code_pass_all_test_reward/mean": 0.7159091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.40782630443573, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 200.875, "completions/mean_terminated_length": 200.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.3565762774395868, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.08354019513353705, "learning_rate": 1.62586068624434e-05, "loss": 0.0033, "num_tokens": 15158945.0, "reward": 1.558333396911621, "reward_std": 0.1178511381149292, "rewards/fixed_code_pass_all_test_reward/mean": 0.5583333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511381149292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 453.125, "completions/mean_terminated_length": 453.125, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.3567607452499539, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.04428578692022711, "learning_rate": 1.625358252739426e-05, "loss": 0.0018, "num_tokens": 15168122.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.35694521306032095, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.11605306901037693, "learning_rate": 1.6248555598494123e-05, "loss": 0.0046, "num_tokens": 15175380.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 567.5, "completions/mean_terminated_length": 567.5, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.3571296808706881, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.04776442190632224, "learning_rate": 1.624352607782806e-05, "loss": 0.0019, "num_tokens": 15188912.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.35731414868105515, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.15056443377397954, "learning_rate": 1.62384939674822e-05, "loss": 0.006, "num_tokens": 15193214.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.3574986164914222, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.06814129906706512, "learning_rate": 1.6233459269543754e-05, "loss": 0.0027, "num_tokens": 15201427.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 247.125, "completions/mean_terminated_length": 247.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.35768308430178936, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796875, "kl": 0.10369215114042163, "learning_rate": 1.6228421986101005e-05, "loss": 0.0041, "num_tokens": 15210300.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 286.125, "completions/mean_terminated_length": 286.125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.35786755211215643, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.04109000333119184, "learning_rate": 1.6223382119243315e-05, "loss": 0.0016, "num_tokens": 15216157.0, "reward": 1.8958332538604736, "reward_std": 0.294627845287323, "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 321.0, "completions/mean_terminated_length": 321.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.3580520199225235, "frac_reward_zero_std": 1.0, "grad_norm": 0.46875, "kl": 0.07777411257848144, "learning_rate": 1.621833967106111e-05, "loss": 0.0031, "num_tokens": 15222053.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.35823648773289063, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.08887311210855842, "learning_rate": 1.6213294643645886e-05, "loss": 0.0036, "num_tokens": 15232395.0, "reward": 1.0703125, "reward_std": 0.5463326573371887, "rewards/fixed_code_pass_all_test_reward/mean": 0.1953125, "rewards/fixed_code_pass_all_test_reward/std": 0.34303903579711914, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.3584209555432577, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.11440791934728622, "learning_rate": 1.620824703909021e-05, "loss": 0.0046, "num_tokens": 15238623.0, "reward": 1.4090908765792847, "reward_std": 0.730512261390686, "rewards/fixed_code_pass_all_test_reward/mean": 0.5340908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.5060146450996399, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 226.375, "completions/mean_terminated_length": 226.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.3586054233536248, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.07157699950039387, "learning_rate": 1.6203196859487723e-05, "loss": 0.0029, "num_tokens": 15244530.0, "reward": 1.44140625, "reward_std": 0.7119118571281433, "rewards/fixed_code_pass_all_test_reward/mean": 0.56640625, "rewards/fixed_code_pass_all_test_reward/std": 0.46902894973754883, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.3587898911639919, "frac_reward_zero_std": 1.0, "grad_norm": 1.3125, "kl": 0.1674301396124065, "learning_rate": 1.6198144106933133e-05, "loss": 0.0067, "num_tokens": 15249563.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 370.75, "completions/mean_terminated_length": 370.75, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.358974358974359, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.051582859829068184, "learning_rate": 1.6193088783522206e-05, "loss": 0.0021, "num_tokens": 15256617.0, "reward": 1.7083332538604736, "reward_std": 0.5072228908538818, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.23002184927463531, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 366.125, "completions/mean_terminated_length": 366.125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.35915882678472605, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.07406810065731406, "learning_rate": 1.6188030891351785e-05, "loss": 0.003, "num_tokens": 15263138.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.3593432945950932, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.10535938804969192, "learning_rate": 1.6182970432519772e-05, "loss": 0.0042, "num_tokens": 15268760.0, "reward": 1.5043103694915771, "reward_std": 0.3347987234592438, "rewards/fixed_code_pass_all_test_reward/mean": 0.5043103098869324, "rewards/fixed_code_pass_all_test_reward/std": 0.33479875326156616, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 193.625, "completions/mean_terminated_length": 193.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.35952776240546025, "frac_reward_zero_std": 1.0, "grad_norm": 0.291015625, "kl": 0.07548337848857045, "learning_rate": 1.6177907409125142e-05, "loss": 0.003, "num_tokens": 15273229.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 226.0, "completions/mean_terminated_length": 226.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.3597122302158273, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.07697132509201765, "learning_rate": 1.617284182326792e-05, "loss": 0.0031, "num_tokens": 15278213.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 228.75, "completions/mean_terminated_length": 228.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.35989669802619445, "frac_reward_zero_std": 1.0, "grad_norm": 0.162109375, "kl": 0.07849190849810839, "learning_rate": 1.6167773677049207e-05, "loss": 0.0031, "num_tokens": 15285811.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.3600811658365615, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.07265546638518572, "learning_rate": 1.616270297257116e-05, "loss": 0.0029, "num_tokens": 15292474.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 327.5, "completions/mean_terminated_length": 327.5, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.3602656336469286, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.03659767744829878, "learning_rate": 1.6157629711936995e-05, "loss": 0.0015, "num_tokens": 15298622.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 325.125, "completions/mean_terminated_length": 325.125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.3604501014572957, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.06031824601814151, "learning_rate": 1.615255389725099e-05, "loss": 0.0024, "num_tokens": 15305351.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.3606345692676628, "frac_reward_zero_std": 1.0, "grad_norm": 0.1904296875, "kl": 0.06347426841966808, "learning_rate": 1.614747553061849e-05, "loss": 0.0025, "num_tokens": 15311191.0, "reward": 1.6938775777816772, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6938775777816772, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 247.25, "completions/mean_terminated_length": 247.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.36081903707802987, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.05881877918727696, "learning_rate": 1.6142394614145884e-05, "loss": 0.0024, "num_tokens": 15320177.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 242.0, "completions/mean_terminated_length": 242.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.361003504888397, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.0754662835970521, "learning_rate": 1.6137311149940633e-05, "loss": 0.003, "num_tokens": 15330217.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 450.375, "completions/mean_terminated_length": 450.375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.36118797269876407, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.031044627190567553, "learning_rate": 1.6132225140111243e-05, "loss": 0.0012, "num_tokens": 15338300.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 438.375, "completions/mean_terminated_length": 438.375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.36137244050913114, "frac_reward_zero_std": 1.0, "grad_norm": 0.060791015625, "kl": 0.040899205254390836, "learning_rate": 1.6127136586767284e-05, "loss": 0.0016, "num_tokens": 15350455.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 562.875, "completions/mean_terminated_length": 562.875, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.36155690831949827, "frac_reward_zero_std": 1.0, "grad_norm": 0.03173828125, "kl": 0.022222058847546577, "learning_rate": 1.6122045492019374e-05, "loss": 0.0009, "num_tokens": 15364030.0, "reward": 1.9506173133850098, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9506173133850098, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 280.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.36174137612986534, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.04695689305663109, "learning_rate": 1.61169518579792e-05, "loss": 0.0019, "num_tokens": 15371007.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 220.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.3619258439402324, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.0885339304804802, "learning_rate": 1.6111855686759478e-05, "loss": 0.0035, "num_tokens": 15380053.0, "reward": 1.2276785373687744, "reward_std": 0.11602241545915604, "rewards/fixed_code_pass_all_test_reward/mean": 0.2276785671710968, "rewards/fixed_code_pass_all_test_reward/std": 0.11602236330509186, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 205.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.36211031175059955, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.06802119128406048, "learning_rate": 1.6106756980473995e-05, "loss": 0.0027, "num_tokens": 15389551.0, "reward": 1.5591216087341309, "reward_std": 0.24419990181922913, "rewards/fixed_code_pass_all_test_reward/mean": 0.5591216087341309, "rewards/fixed_code_pass_all_test_reward/std": 0.24419990181922913, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 460.875, "completions/mean_terminated_length": 460.875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.3622947795609666, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.051183353178203106, "learning_rate": 1.610165574123759e-05, "loss": 0.002, "num_tokens": 15397494.0, "reward": 1.75, "reward_std": 0.28943783044815063, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.28943780064582825, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.3624792473713337, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.06448211311362684, "learning_rate": 1.6096551971166138e-05, "loss": 0.0026, "num_tokens": 15402014.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.3626637151817008, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.0883217363152653, "learning_rate": 1.609144567237658e-05, "loss": 0.0035, "num_tokens": 15413515.0, "reward": 1.4672620296478271, "reward_std": 0.1838505119085312, "rewards/fixed_code_pass_all_test_reward/mean": 0.4672618806362152, "rewards/fixed_code_pass_all_test_reward/std": 0.18385054171085358, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 393.125, "completions/mean_terminated_length": 393.125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.3628481829920679, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.057407434564083815, "learning_rate": 1.6086336846986893e-05, "loss": 0.0023, "num_tokens": 15421884.0, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 432.25, "completions/mean_terminated_length": 432.25, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.36303265080243496, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.06289669033139944, "learning_rate": 1.6081225497116108e-05, "loss": 0.0025, "num_tokens": 15434430.0, "reward": 1.5762711763381958, "reward_std": 0.33410534262657166, "rewards/fixed_code_pass_all_test_reward/mean": 0.5762711763381958, "rewards/fixed_code_pass_all_test_reward/std": 0.33410534262657166, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.3632171186128021, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.06418906385079026, "learning_rate": 1.6076111624884303e-05, "loss": 0.0026, "num_tokens": 15439826.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 278.375, "completions/mean_terminated_length": 278.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.36340158642316917, "frac_reward_zero_std": 1.0, "grad_norm": 0.061279296875, "kl": 0.04396837146487087, "learning_rate": 1.6070995232412605e-05, "loss": 0.0018, "num_tokens": 15448317.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 300.125, "completions/mean_terminated_length": 300.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.36358605423353624, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.07388905016705394, "learning_rate": 1.606587632182318e-05, "loss": 0.003, "num_tokens": 15459310.0, "reward": 1.8143939971923828, "reward_std": 0.34491392970085144, "rewards/fixed_code_pass_all_test_reward/mean": 0.8143939971923828, "rewards/fixed_code_pass_all_test_reward/std": 0.34491395950317383, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 164.375, "completions/mean_terminated_length": 164.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.36377052204390337, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.062206740491092205, "learning_rate": 1.6060754895239244e-05, "loss": 0.0025, "num_tokens": 15464425.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.36395498985427044, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.09329703077673912, "learning_rate": 1.605563095478505e-05, "loss": 0.0037, "num_tokens": 15472515.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.3641394576646375, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.09709947509691119, "learning_rate": 1.60505045025859e-05, "loss": 0.0039, "num_tokens": 15478767.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 330.375, "completions/mean_terminated_length": 330.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.3643239254750046, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.057204480515792966, "learning_rate": 1.6045375540768136e-05, "loss": 0.0023, "num_tokens": 15488250.0, "reward": 1.0446429252624512, "reward_std": 0.025253813713788986, "rewards/fixed_code_pass_all_test_reward/mean": 0.0446428582072258, "rewards/fixed_code_pass_all_test_reward/std": 0.025253813713788986, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 198.25, "completions/mean_terminated_length": 198.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.3645083932853717, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.05190826579928398, "learning_rate": 1.6040244071459145e-05, "loss": 0.0021, "num_tokens": 15497428.0, "reward": 1.110119104385376, "reward_std": 0.16044636070728302, "rewards/fixed_code_pass_all_test_reward/mean": 0.1101190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.16044636070728302, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 470.0, "completions/mean_terminated_length": 470.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.3646928610957388, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.04852114827372134, "learning_rate": 1.603511009678734e-05, "loss": 0.0019, "num_tokens": 15504948.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 196.125, "completions/mean_terminated_length": 196.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.36487732890610586, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.06563987908884883, "learning_rate": 1.6029973618882187e-05, "loss": 0.0026, "num_tokens": 15513109.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 308.0, "completions/mean_terminated_length": 308.0, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.365061796716473, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.08761126268655062, "learning_rate": 1.6024834639874193e-05, "loss": 0.0035, "num_tokens": 15523581.0, "reward": 1.2159091234207153, "reward_std": 0.13690367341041565, "rewards/fixed_code_pass_all_test_reward/mean": 0.21590909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.13690370321273804, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 447.875, "completions/mean_terminated_length": 447.875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.36524626452684006, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.02594244130887091, "learning_rate": 1.601969316189489e-05, "loss": 0.001, "num_tokens": 15535092.0, "reward": 1.75, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 262.375, "completions/mean_terminated_length": 262.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.36543073233720713, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.05951994098722935, "learning_rate": 1.6014549187076847e-05, "loss": 0.0024, "num_tokens": 15540271.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 405.875, "completions/mean_terminated_length": 405.875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.36561520014757426, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.024319962365552783, "learning_rate": 1.600940271755368e-05, "loss": 0.001, "num_tokens": 15547814.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 164.375, "completions/mean_terminated_length": 164.375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.36579966795794133, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.07073280168697238, "learning_rate": 1.6004253755460032e-05, "loss": 0.0028, "num_tokens": 15551865.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 315.75, "completions/mean_terminated_length": 315.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3659841357683084, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.082780153141357, "learning_rate": 1.5999102302931585e-05, "loss": 0.0033, "num_tokens": 15557943.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 438.75, "completions/mean_terminated_length": 438.75, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.36616860357867553, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.057099258061498404, "learning_rate": 1.5993948362105044e-05, "loss": 0.0023, "num_tokens": 15571021.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 326.0, "completions/mean_terminated_length": 326.0, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.3663530713890426, "frac_reward_zero_std": 1.0, "grad_norm": 0.22265625, "kl": 0.06515619484707713, "learning_rate": 1.5988791935118154e-05, "loss": 0.0026, "num_tokens": 15578501.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3665375391994097, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.04087330191396177, "learning_rate": 1.598363302410969e-05, "loss": 0.0016, "num_tokens": 15588301.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 506.75, "completions/mean_terminated_length": 506.75, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.3667220070097768, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.04536524345166981, "learning_rate": 1.5978471631219456e-05, "loss": 0.0018, "num_tokens": 15603051.0, "reward": 1.796875, "reward_std": 0.17278572916984558, "rewards/fixed_code_pass_all_test_reward/mean": 0.796875, "rewards/fixed_code_pass_all_test_reward/std": 0.17278574407100677, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.3669064748201439, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.05325540737248957, "learning_rate": 1.5973307758588292e-05, "loss": 0.0021, "num_tokens": 15608066.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.36709094263051095, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.056364082265645266, "learning_rate": 1.5968141408358053e-05, "loss": 0.0023, "num_tokens": 15612222.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 347.25, "completions/mean_terminated_length": 347.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.3672754104408781, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.0660260608419776, "learning_rate": 1.5962972582671633e-05, "loss": 0.0026, "num_tokens": 15620944.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.36745987825124515, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.07427097018808126, "learning_rate": 1.595780128367295e-05, "loss": 0.003, "num_tokens": 15626409.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 325.625, "completions/mean_terminated_length": 325.625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.3676443460616122, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.047056706389412284, "learning_rate": 1.595262751350695e-05, "loss": 0.0019, "num_tokens": 15633902.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 208.375, "completions/mean_terminated_length": 208.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.36782881387197935, "frac_reward_zero_std": 1.0, "grad_norm": 0.24609375, "kl": 0.08007701393216848, "learning_rate": 1.594745127431959e-05, "loss": 0.0032, "num_tokens": 15642545.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.3680132816823464, "frac_reward_zero_std": 0.0, "grad_norm": 3.21875, "kl": 0.18081613071262836, "learning_rate": 1.5942272568257875e-05, "loss": 0.0072, "num_tokens": 15649775.0, "reward": 1.4375, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.3681977494927135, "frac_reward_zero_std": 1.0, "grad_norm": 0.734375, "kl": 0.11787127470597625, "learning_rate": 1.5937091397469816e-05, "loss": 0.0047, "num_tokens": 15657673.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.36838221730308063, "frac_reward_zero_std": 1.0, "grad_norm": 0.35546875, "kl": 0.07055616844445467, "learning_rate": 1.593190776410445e-05, "loss": 0.0028, "num_tokens": 15665226.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.3685666851134477, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.06326983496546745, "learning_rate": 1.592672167031183e-05, "loss": 0.0025, "num_tokens": 15669699.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 247.875, "completions/mean_terminated_length": 247.875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.3687511529238148, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.18244908563792706, "learning_rate": 1.592153311824305e-05, "loss": 0.0073, "num_tokens": 15675906.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 508.75, "completions/mean_terminated_length": 508.75, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.3689356207341819, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.03532798239029944, "learning_rate": 1.5916342110050197e-05, "loss": 0.0014, "num_tokens": 15685824.0, "reward": 1.5989583730697632, "reward_std": 0.24744480848312378, "rewards/fixed_code_pass_all_test_reward/mean": 0.5989583730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.24744479358196259, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 311.125, "completions/mean_terminated_length": 311.125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.369120088544549, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.03856912418268621, "learning_rate": 1.5911148647886393e-05, "loss": 0.0015, "num_tokens": 15691777.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 375.0, "completions/mean_terminated_length": 375.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.36930455635491605, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.05231935530900955, "learning_rate": 1.5905952733905777e-05, "loss": 0.0021, "num_tokens": 15699305.0, "reward": 1.298387050628662, "reward_std": 0.4330449104309082, "rewards/fixed_code_pass_all_test_reward/mean": 0.2983870804309845, "rewards/fixed_code_pass_all_test_reward/std": 0.4330449402332306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 383.375, "completions/mean_terminated_length": 383.375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.3694890241652832, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.06140609132125974, "learning_rate": 1.5900754370263495e-05, "loss": 0.0025, "num_tokens": 15710060.0, "reward": 1.8250000476837158, "reward_std": 0.3284160792827606, "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.328416109085083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.36967349197565025, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.05851713567972183, "learning_rate": 1.5895553559115724e-05, "loss": 0.0023, "num_tokens": 15721914.0, "reward": 1.4886362552642822, "reward_std": 0.3156205117702484, "rewards/fixed_code_pass_all_test_reward/mean": 0.4886363446712494, "rewards/fixed_code_pass_all_test_reward/std": 0.315620481967926, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 726.625, "completions/mean_terminated_length": 726.625, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.3698579597860173, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.03720540972426534, "learning_rate": 1.5890350302619643e-05, "loss": 0.0015, "num_tokens": 15733223.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.37004242759638445, "frac_reward_zero_std": 1.0, "grad_norm": 0.50390625, "kl": 0.10310816252604127, "learning_rate": 1.5885144602933453e-05, "loss": 0.0041, "num_tokens": 15742609.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 336.25, "completions/mean_terminated_length": 336.25, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.3702268954067515, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.061628656927496195, "learning_rate": 1.5879936462216364e-05, "loss": 0.0025, "num_tokens": 15751211.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.3704113632171186, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.10422086296603084, "learning_rate": 1.58747258826286e-05, "loss": 0.0042, "num_tokens": 15760612.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 304.0, "completions/mean_terminated_length": 304.0, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.3705958310274857, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.05778342252597213, "learning_rate": 1.58695128663314e-05, "loss": 0.0023, "num_tokens": 15769668.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.3707802988378528, "frac_reward_zero_std": 0.0, "grad_norm": 3.296875, "kl": 0.09463334223255515, "learning_rate": 1.5864297415487006e-05, "loss": 0.0038, "num_tokens": 15776945.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 185.75, "completions/mean_terminated_length": 185.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.37096476664821987, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.052100331755355, "learning_rate": 1.585907953225868e-05, "loss": 0.0021, "num_tokens": 15784663.0, "reward": 1.6666667461395264, "reward_std": 0.13468700647354126, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.13468700647354126, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.371149234458587, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.07944595301523805, "learning_rate": 1.5853859218810683e-05, "loss": 0.0032, "num_tokens": 15794160.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 354.25, "completions/mean_terminated_length": 354.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.37133370226895407, "frac_reward_zero_std": 1.0, "grad_norm": 0.57421875, "kl": 0.07848027488216758, "learning_rate": 1.5848636477308285e-05, "loss": 0.0031, "num_tokens": 15800650.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 329.625, "completions/mean_terminated_length": 329.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.37151817007932114, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.10067172441631556, "learning_rate": 1.5843411309917773e-05, "loss": 0.004, "num_tokens": 15810959.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.37170263788968827, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.05117000313475728, "learning_rate": 1.5838183718806434e-05, "loss": 0.002, "num_tokens": 15815317.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 392.25, "completions/mean_terminated_length": 392.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.37188710570005534, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.042787213111296296, "learning_rate": 1.5832953706142556e-05, "loss": 0.0017, "num_tokens": 15822919.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.3720715735104224, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.0659822840243578, "learning_rate": 1.582772127409544e-05, "loss": 0.0026, "num_tokens": 15829323.0, "reward": 1.100000023841858, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.10000000149011612, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.37225604132078954, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.044839849695563316, "learning_rate": 1.5822486424835377e-05, "loss": 0.0018, "num_tokens": 15836199.0, "reward": 1.75, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 307.75, "completions/mean_terminated_length": 307.75, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.3724405091311566, "frac_reward_zero_std": 1.0, "grad_norm": 0.0439453125, "kl": 0.03441230161115527, "learning_rate": 1.5817249160533678e-05, "loss": 0.0014, "num_tokens": 15844845.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.3726249769415237, "frac_reward_zero_std": 1.0, "grad_norm": 0.2890625, "kl": 0.053307154681533575, "learning_rate": 1.5812009483362643e-05, "loss": 0.0021, "num_tokens": 15850823.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 434.875, "completions/mean_terminated_length": 434.875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.3728094447518908, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.03862924687564373, "learning_rate": 1.5806767395495576e-05, "loss": 0.0015, "num_tokens": 15859590.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 348.0, "completions/mean_terminated_length": 348.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.3729939125622579, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.03247361700050533, "learning_rate": 1.5801522899106786e-05, "loss": 0.0013, "num_tokens": 15869366.0, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.37317838037262496, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.06877316953614354, "learning_rate": 1.5796275996371576e-05, "loss": 0.0028, "num_tokens": 15878716.0, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.41052016615867615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 232.875, "completions/mean_terminated_length": 232.875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3733628481829921, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.08669041283428669, "learning_rate": 1.5791026689466248e-05, "loss": 0.0035, "num_tokens": 15883643.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.37354731599335916, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.08197324490174651, "learning_rate": 1.5785774980568094e-05, "loss": 0.0033, "num_tokens": 15893771.0, "reward": 1.4027777910232544, "reward_std": 0.35105034708976746, "rewards/fixed_code_pass_all_test_reward/mean": 0.5277777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.07856742292642593, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 174.5, "completions/mean_terminated_length": 174.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.37373178380372624, "frac_reward_zero_std": 0.0, "grad_norm": 4.09375, "kl": 0.0910192346200347, "learning_rate": 1.5780520871855418e-05, "loss": 0.0036, "num_tokens": 15898143.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 317.0, "completions/mean_terminated_length": 317.0, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.37391625161409336, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.059741366654634476, "learning_rate": 1.577526436550751e-05, "loss": 0.0024, "num_tokens": 15909919.0, "reward": 1.8806817531585693, "reward_std": 0.3374828100204468, "rewards/fixed_code_pass_all_test_reward/mean": 0.8806818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.3374827802181244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 202.5, "completions/mean_terminated_length": 202.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.37410071942446044, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0588384591974318, "learning_rate": 1.577000546370465e-05, "loss": 0.0024, "num_tokens": 15915643.0, "reward": 1.746323585510254, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.7463235259056091, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 369.0, "completions/mean_terminated_length": 369.0, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.3742851872348275, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.11099007353186607, "learning_rate": 1.576474416862812e-05, "loss": 0.0044, "num_tokens": 15923475.0, "reward": 0.14772728085517883, "reward_std": 0.41783586144447327, "rewards/fixed_code_pass_all_test_reward/mean": 0.022727273404598236, "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.37446965504519464, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.04798130341805518, "learning_rate": 1.5759480482460198e-05, "loss": 0.0019, "num_tokens": 15928564.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.3746541228555617, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.09226721432060003, "learning_rate": 1.5754214407384136e-05, "loss": 0.0037, "num_tokens": 15933094.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.3748385906659288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.04596186277922243, "learning_rate": 1.5748945945584195e-05, "loss": 0.0018, "num_tokens": 15937688.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 317.875, "completions/mean_terminated_length": 317.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.3750230584762959, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.1208884185180068, "learning_rate": 1.5743675099245615e-05, "loss": 0.0048, "num_tokens": 15948519.0, "reward": 1.6041666269302368, "reward_std": 0.32180801033973694, "rewards/fixed_code_pass_all_test_reward/mean": 0.6041666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3218080401420593, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.375207526286663, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.05316197592765093, "learning_rate": 1.5738401870554633e-05, "loss": 0.0021, "num_tokens": 15958852.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.37539199409703006, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.037740341387689114, "learning_rate": 1.5733126261698474e-05, "loss": 0.0015, "num_tokens": 15962902.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.3755764619073972, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.044040365144610405, "learning_rate": 1.5727848274865336e-05, "loss": 0.0018, "num_tokens": 15968810.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 344.75, "completions/mean_terminated_length": 344.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.37576092971776426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.0334241179516539, "learning_rate": 1.5722567912244425e-05, "loss": 0.0013, "num_tokens": 15976288.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 318.75, "completions/mean_terminated_length": 318.75, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.37594539752813133, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.03826383489649743, "learning_rate": 1.5717285176025913e-05, "loss": 0.0015, "num_tokens": 15982646.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 201.75, "completions/mean_terminated_length": 201.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.37612986533849846, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251953125, "kl": 0.04680706141516566, "learning_rate": 1.5712000068400968e-05, "loss": 0.0019, "num_tokens": 15987108.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 122.125, "completions/mean_terminated_length": 122.125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.37631433314886553, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.06196284294128418, "learning_rate": 1.570671259156174e-05, "loss": 0.0025, "num_tokens": 15990821.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 215.625, "completions/mean_terminated_length": 215.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.3764988009592326, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.07945261057466269, "learning_rate": 1.5701422747701358e-05, "loss": 0.0032, "num_tokens": 15999530.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3766832687695997, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.06810652907006443, "learning_rate": 1.5696130539013938e-05, "loss": 0.0027, "num_tokens": 16005511.0, "reward": 1.923076868057251, "reward_std": 0.0822342112660408, "rewards/fixed_code_pass_all_test_reward/mean": 0.9230769276618958, "rewards/fixed_code_pass_all_test_reward/std": 0.082234226167202, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.3768677365799668, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.08281904086470604, "learning_rate": 1.569083596769457e-05, "loss": 0.0033, "num_tokens": 16014995.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.3770522043903339, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.039036361034959555, "learning_rate": 1.568553903593933e-05, "loss": 0.0016, "num_tokens": 16021653.0, "reward": 1.5726743936538696, "reward_std": 0.17266559600830078, "rewards/fixed_code_pass_all_test_reward/mean": 0.5726743936538696, "rewards/fixed_code_pass_all_test_reward/std": 0.17266561090946198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 370.375, "completions/mean_terminated_length": 370.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.37723667220070095, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.0390088795684278, "learning_rate": 1.568023974594527e-05, "loss": 0.0016, "num_tokens": 16031432.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.3774211400110681, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.08384355437010527, "learning_rate": 1.5674938099910425e-05, "loss": 0.0034, "num_tokens": 16040627.0, "reward": 1.8024194240570068, "reward_std": 0.34974879026412964, "rewards/fixed_code_pass_all_test_reward/mean": 0.8024193644523621, "rewards/fixed_code_pass_all_test_reward/std": 0.349748820066452, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 532.25, "completions/mean_terminated_length": 532.25, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.37760560782143515, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.03417499549686909, "learning_rate": 1.5669634100033798e-05, "loss": 0.0014, "num_tokens": 16055157.0, "reward": 1.7083332538604736, "reward_std": 0.3435344099998474, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.343534380197525, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3777900756318022, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.04946702439337969, "learning_rate": 1.566432774851537e-05, "loss": 0.002, "num_tokens": 16059655.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 208.5, "completions/mean_terminated_length": 208.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.37797454344216935, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.07648951536975801, "learning_rate": 1.5659019047556115e-05, "loss": 0.0031, "num_tokens": 16064931.0, "reward": 1.875, "reward_std": 0.1937432438135147, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.1937432438135147, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 299.375, "completions/mean_terminated_length": 299.375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.3781590112525364, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.06417302042245865, "learning_rate": 1.5653707999357957e-05, "loss": 0.0026, "num_tokens": 16074414.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 507.75, "completions/mean_terminated_length": 507.75, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.3783434790629035, "frac_reward_zero_std": 1.0, "grad_norm": 0.12109375, "kl": 0.042284462600946426, "learning_rate": 1.5648394606123805e-05, "loss": 0.0017, "num_tokens": 16089548.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 365.375, "completions/mean_terminated_length": 365.375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.3785279468732706, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.05202594166621566, "learning_rate": 1.5643078870057538e-05, "loss": 0.0021, "num_tokens": 16097767.0, "reward": 1.662500023841858, "reward_std": 0.1060660183429718, "rewards/fixed_code_pass_all_test_reward/mean": 0.6625000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.1060660183429718, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.3787124146836377, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.05125638097524643, "learning_rate": 1.5637760793364013e-05, "loss": 0.0021, "num_tokens": 16103311.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 267.0, "completions/mean_terminated_length": 267.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.37889688249400477, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.06872270232997835, "learning_rate": 1.5632440378249053e-05, "loss": 0.0027, "num_tokens": 16111791.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 439.875, "completions/mean_terminated_length": 439.875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.3790813503043719, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.1027715727686882, "learning_rate": 1.562711762691945e-05, "loss": 0.0041, "num_tokens": 16123230.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.379265818114739, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.07924538804218173, "learning_rate": 1.5621792541582968e-05, "loss": 0.0032, "num_tokens": 16132443.0, "reward": 1.4107143878936768, "reward_std": 0.12123839557170868, "rewards/fixed_code_pass_all_test_reward/mean": 0.4107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.1212383583188057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 225.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.37945028592510605, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.06621432583779097, "learning_rate": 1.5616465124448333e-05, "loss": 0.0026, "num_tokens": 16140352.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 315.625, "completions/mean_terminated_length": 315.625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.3796347537354732, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "kl": 0.668693580199033, "learning_rate": 1.5611135377725252e-05, "loss": 0.0267, "num_tokens": 16150173.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 240.375, "completions/mean_terminated_length": 240.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.37981922154584025, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.06371494638733566, "learning_rate": 1.5605803303624375e-05, "loss": 0.0025, "num_tokens": 16154880.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.3800036893562073, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.05254642269574106, "learning_rate": 1.560046890435734e-05, "loss": 0.0021, "num_tokens": 16160742.0, "reward": 1.8365384340286255, "reward_std": 0.3401493728160858, "rewards/fixed_code_pass_all_test_reward/mean": 0.8365384340286255, "rewards/fixed_code_pass_all_test_reward/std": 0.3401494026184082, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 453.375, "completions/mean_terminated_length": 453.375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.38018815716657445, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.07008700189180672, "learning_rate": 1.5595132182136743e-05, "loss": 0.0028, "num_tokens": 16169609.0, "reward": 1.4544271230697632, "reward_std": 0.4185943603515625, "rewards/fixed_code_pass_all_test_reward/mean": 0.5794271230697632, "rewards/fixed_code_pass_all_test_reward/std": 0.304965615272522, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 416.875, "completions/mean_terminated_length": 416.875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.3803726249769415, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.06953431153669953, "learning_rate": 1.558979313917613e-05, "loss": 0.0028, "num_tokens": 16180120.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 416.375, "completions/mean_terminated_length": 416.375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.3805570927873086, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.046692407224327326, "learning_rate": 1.5584451777690035e-05, "loss": 0.0019, "num_tokens": 16189715.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 454.25, "completions/mean_terminated_length": 454.25, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.3807415605976757, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.046214085072278976, "learning_rate": 1.5579108099893933e-05, "loss": 0.0018, "num_tokens": 16202357.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 502.125, "completions/mean_terminated_length": 502.125, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.3809260284080428, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.05235015950165689, "learning_rate": 1.5573762108004262e-05, "loss": 0.0021, "num_tokens": 16215326.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.38111049621840987, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.12724767345935106, "learning_rate": 1.5568413804238427e-05, "loss": 0.0051, "num_tokens": 16223928.0, "reward": 1.4314515590667725, "reward_std": 0.4163084924221039, "rewards/fixed_code_pass_all_test_reward/mean": 0.43145161867141724, "rewards/fixed_code_pass_all_test_reward/std": 0.4163084924221039, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.381294964028777, "frac_reward_zero_std": 1.0, "grad_norm": 0.279296875, "kl": 0.05375446705147624, "learning_rate": 1.5563063190814794e-05, "loss": 0.0022, "num_tokens": 16230192.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 147.75, "completions/mean_terminated_length": 147.75, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.38147943183914407, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.11166008515283465, "learning_rate": 1.555771026995267e-05, "loss": 0.0045, "num_tokens": 16239006.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.38166389964951114, "frac_reward_zero_std": 1.0, "grad_norm": 0.380859375, "kl": 0.10377488005906343, "learning_rate": 1.5552355043872343e-05, "loss": 0.0042, "num_tokens": 16242925.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.38184836745987827, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.06018342520110309, "learning_rate": 1.5546997514795045e-05, "loss": 0.0024, "num_tokens": 16248551.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.38203283527024534, "frac_reward_zero_std": 0.0, "grad_norm": 3.359375, "kl": 0.10915123345330358, "learning_rate": 1.554163768494295e-05, "loss": 0.0044, "num_tokens": 16252824.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.3822173030806124, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.054645831463858485, "learning_rate": 1.5536275556539214e-05, "loss": 0.0022, "num_tokens": 16261945.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.38240177089097954, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.10541945928707719, "learning_rate": 1.5530911131807926e-05, "loss": 0.0042, "num_tokens": 16270267.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.3825862387013466, "frac_reward_zero_std": 0.0, "grad_norm": 3.953125, "kl": 0.0711380762513727, "learning_rate": 1.552554441297413e-05, "loss": 0.0028, "num_tokens": 16278208.0, "reward": 1.59375, "reward_std": 0.25074294209480286, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.25074294209480286, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 325.25, "completions/mean_terminated_length": 325.25, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3827707065117137, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.04896661383099854, "learning_rate": 1.5520175402263837e-05, "loss": 0.002, "num_tokens": 16285402.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.3829551743220808, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.05628492636606097, "learning_rate": 1.5514804101903983e-05, "loss": 0.0023, "num_tokens": 16292744.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.3831396421324479, "frac_reward_zero_std": 1.0, "grad_norm": 0.267578125, "kl": 0.08528974140062928, "learning_rate": 1.5509430514122476e-05, "loss": 0.0034, "num_tokens": 16303256.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.38332410994281496, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.05962030729278922, "learning_rate": 1.550405464114816e-05, "loss": 0.0024, "num_tokens": 16307913.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 313.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.3835085777531821, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.05935555463656783, "learning_rate": 1.5498676485210835e-05, "loss": 0.0024, "num_tokens": 16317069.0, "reward": 1.769230842590332, "reward_std": 0.4273015856742859, "rewards/fixed_code_pass_all_test_reward/mean": 0.7692307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.4273015856742859, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 210.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.38369304556354916, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.05784874456003308, "learning_rate": 1.549329604854124e-05, "loss": 0.0023, "num_tokens": 16324373.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.38387751337391623, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.0639146938920021, "learning_rate": 1.5487913333371063e-05, "loss": 0.0026, "num_tokens": 16332928.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.38406198118428336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.0593506945297122, "learning_rate": 1.5482528341932946e-05, "loss": 0.0024, "num_tokens": 16340332.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.38424644899465044, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.05221434868872166, "learning_rate": 1.547714107646046e-05, "loss": 0.0021, "num_tokens": 16348443.0, "reward": 1.6931817531585693, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.8181818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.3844309168050175, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.07640704186633229, "learning_rate": 1.5471751539188125e-05, "loss": 0.0031, "num_tokens": 16355195.0, "reward": 1.5025510787963867, "reward_std": 0.11796321719884872, "rewards/fixed_code_pass_all_test_reward/mean": 0.5025510191917419, "rewards/fixed_code_pass_all_test_reward/std": 0.1179632768034935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 258.125, "completions/mean_terminated_length": 258.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.38461538461538464, "frac_reward_zero_std": 1.0, "grad_norm": 0.162109375, "kl": 0.07053322717547417, "learning_rate": 1.546635973235141e-05, "loss": 0.0028, "num_tokens": 16360196.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 279.0, "completions/mean_terminated_length": 279.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.3847998524257517, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.10130987782031298, "learning_rate": 1.546096565818672e-05, "loss": 0.0041, "num_tokens": 16369732.0, "reward": 1.423387050628662, "reward_std": 0.7211962938308716, "rewards/fixed_code_pass_all_test_reward/mean": 0.6733871102333069, "rewards/fixed_code_pass_all_test_reward/std": 0.4547088146209717, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 391.625, "completions/mean_terminated_length": 391.625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.3849843202361188, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.032507042051292956, "learning_rate": 1.545556931893139e-05, "loss": 0.0013, "num_tokens": 16378025.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.3851687880464859, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.054513310780748725, "learning_rate": 1.5450170716823723e-05, "loss": 0.0022, "num_tokens": 16386875.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 217.25, "completions/mean_terminated_length": 217.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.385353255856853, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.033448591362684965, "learning_rate": 1.5444769854102928e-05, "loss": 0.0013, "num_tokens": 16392053.0, "reward": 1.453125, "reward_std": 0.22097086906433105, "rewards/fixed_code_pass_all_test_reward/mean": 0.453125, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.38553772366722006, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.039566931896843016, "learning_rate": 1.5439366733009174e-05, "loss": 0.0016, "num_tokens": 16396452.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.3857221914775872, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.05237422767095268, "learning_rate": 1.5433961355783553e-05, "loss": 0.0021, "num_tokens": 16406366.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 409.875, "completions/mean_terminated_length": 409.875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.38590665928795426, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.0463007225189358, "learning_rate": 1.5428553724668103e-05, "loss": 0.0019, "num_tokens": 16415597.0, "reward": 1.5052083730697632, "reward_std": 0.4025510847568512, "rewards/fixed_code_pass_all_test_reward/mean": 0.6302083730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.2555800974369049, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.38609112709832133, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.08666781568899751, "learning_rate": 1.542314384190579e-05, "loss": 0.0035, "num_tokens": 16420951.0, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.38627559490868846, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.0665999969933182, "learning_rate": 1.541773170974052e-05, "loss": 0.0027, "num_tokens": 16429880.0, "reward": 1.4166666269302368, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 218.125, "completions/mean_terminated_length": 218.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.38646006271905553, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.10467825084924698, "learning_rate": 1.5412317330417122e-05, "loss": 0.0042, "num_tokens": 16438305.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.3866445305294226, "frac_reward_zero_std": 1.0, "grad_norm": 0.1826171875, "kl": 0.041750404285266995, "learning_rate": 1.5406900706181373e-05, "loss": 0.0017, "num_tokens": 16446180.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 168.625, "completions/mean_terminated_length": 168.625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.38682899833978973, "frac_reward_zero_std": 1.0, "grad_norm": 0.1728515625, "kl": 0.06336347386240959, "learning_rate": 1.5401481839279966e-05, "loss": 0.0025, "num_tokens": 16451049.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.3870134661501568, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.04516768455505371, "learning_rate": 1.539606073196053e-05, "loss": 0.0018, "num_tokens": 16455034.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3871979339605239, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.03842973452992737, "learning_rate": 1.5390637386471626e-05, "loss": 0.0015, "num_tokens": 16463272.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.387382401770891, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.09008994232863188, "learning_rate": 1.538521180506274e-05, "loss": 0.0036, "num_tokens": 16467484.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.3875668695812581, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.05059356498531997, "learning_rate": 1.5379783989984277e-05, "loss": 0.002, "num_tokens": 16473311.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 227.125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.38775133739162515, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.0318365809507668, "learning_rate": 1.5374353943487592e-05, "loss": 0.0013, "num_tokens": 16479536.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3879358052019923, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.08126905700191855, "learning_rate": 1.5368921667824945e-05, "loss": 0.0033, "num_tokens": 16486159.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 139.625, "completions/mean_terminated_length": 139.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.38812027301235935, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.09959089336916804, "learning_rate": 1.5363487165249522e-05, "loss": 0.004, "num_tokens": 16493476.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.3883047408227264, "frac_reward_zero_std": 1.0, "grad_norm": 0.255859375, "kl": 0.09049356263130903, "learning_rate": 1.5358050438015443e-05, "loss": 0.0036, "num_tokens": 16501733.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 129.25, "completions/mean_terminated_length": 129.25, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.38848920863309355, "frac_reward_zero_std": 1.0, "grad_norm": 0.265625, "kl": 0.08977415831759572, "learning_rate": 1.5352611488377743e-05, "loss": 0.0036, "num_tokens": 16508095.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 268.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.3886736764434606, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.06998494127765298, "learning_rate": 1.534717031859238e-05, "loss": 0.0028, "num_tokens": 16513265.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 280.625, "completions/mean_terminated_length": 280.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.3888581442538277, "frac_reward_zero_std": 1.0, "grad_norm": 0.5546875, "kl": 0.10584565624594688, "learning_rate": 1.5341726930916236e-05, "loss": 0.0042, "num_tokens": 16518430.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 380.25, "completions/mean_terminated_length": 380.25, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.38904261206419477, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.028123725205659866, "learning_rate": 1.533628132760711e-05, "loss": 0.0011, "num_tokens": 16531728.0, "reward": 1.1639344692230225, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.16393442451953888, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3892270798745619, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.08719945047050714, "learning_rate": 1.533083351092372e-05, "loss": 0.0035, "num_tokens": 16540159.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 253.75, "completions/mean_terminated_length": 253.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.38941154768492897, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.06535596633329988, "learning_rate": 1.5325383483125703e-05, "loss": 0.0026, "num_tokens": 16550429.0, "reward": 1.7708332538604736, "reward_std": 0.34107092022895813, "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.34107092022895813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 296.625, "completions/mean_terminated_length": 296.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.38959601549529604, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.030416505178436637, "learning_rate": 1.531993124647361e-05, "loss": 0.0012, "num_tokens": 16556610.0, "reward": 1.9249999523162842, "reward_std": 0.2121320217847824, "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 407.625, "completions/mean_terminated_length": 407.625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.3897804833056632, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.057816775515675545, "learning_rate": 1.5314476803228918e-05, "loss": 0.0023, "num_tokens": 16566335.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 251.0, "completions/mean_terminated_length": 251.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.38996495111603025, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.052903703879565, "learning_rate": 1.5309020155654007e-05, "loss": 0.0021, "num_tokens": 16571519.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 439.25, "completions/mean_terminated_length": 439.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.3901494189263973, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.05139669170603156, "learning_rate": 1.5303561306012174e-05, "loss": 0.0021, "num_tokens": 16580081.0, "reward": 1.923076868057251, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9230769276618958, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 200.75, "completions/mean_terminated_length": 200.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.39033388673676445, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.10728083550930023, "learning_rate": 1.529810025656764e-05, "loss": 0.0043, "num_tokens": 16584679.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.3905183545471315, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.07568383729085326, "learning_rate": 1.5292637009585524e-05, "loss": 0.003, "num_tokens": 16592583.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 284.75, "completions/mean_terminated_length": 284.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3907028223574986, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.06489233998581767, "learning_rate": 1.5287171567331866e-05, "loss": 0.0026, "num_tokens": 16598197.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 178.375, "completions/mean_terminated_length": 178.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.3908872901678657, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0506912344135344, "learning_rate": 1.5281703932073613e-05, "loss": 0.002, "num_tokens": 16602360.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 212.125, "completions/mean_terminated_length": 212.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.3910717579782328, "frac_reward_zero_std": 1.0, "grad_norm": 0.2021484375, "kl": 0.06867270031943917, "learning_rate": 1.527623410607862e-05, "loss": 0.0027, "num_tokens": 16610673.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 234.375, "completions/mean_terminated_length": 234.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.39125622578859987, "frac_reward_zero_std": 1.0, "grad_norm": 1.0390625, "kl": 0.08102656516712159, "learning_rate": 1.5270762091615652e-05, "loss": 0.0032, "num_tokens": 16618492.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 230.875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.391440693598967, "frac_reward_zero_std": 1.0, "grad_norm": 0.2734375, "kl": 0.07038999604992568, "learning_rate": 1.526528789095439e-05, "loss": 0.0028, "num_tokens": 16624539.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.39162516140933407, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.08400578889995813, "learning_rate": 1.5259811506365402e-05, "loss": 0.0034, "num_tokens": 16629446.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.39180962921970114, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.04788930423092097, "learning_rate": 1.5254332940120184e-05, "loss": 0.0019, "num_tokens": 16638298.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 335.75, "completions/mean_terminated_length": 335.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.39199409703006827, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.04541666666045785, "learning_rate": 1.5248852194491124e-05, "loss": 0.0018, "num_tokens": 16649240.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.39217856484043534, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.0658180033788085, "learning_rate": 1.5243369271751514e-05, "loss": 0.0026, "num_tokens": 16658424.0, "reward": 1.5625, "reward_std": 0.6781013607978821, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 145.875, "completions/mean_terminated_length": 145.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.3923630326508024, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.07362252427265048, "learning_rate": 1.523788417417556e-05, "loss": 0.0029, "num_tokens": 16662415.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 200.875, "completions/mean_terminated_length": 200.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.39254750046116954, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.07338337576948106, "learning_rate": 1.5232396904038352e-05, "loss": 0.0029, "num_tokens": 16669246.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 360.75, "completions/mean_terminated_length": 360.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.3927319682715366, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.05722016096115112, "learning_rate": 1.5226907463615898e-05, "loss": 0.0023, "num_tokens": 16676860.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 365.25, "completions/mean_terminated_length": 365.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.3929164360819037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.04458330781199038, "learning_rate": 1.5221415855185098e-05, "loss": 0.0018, "num_tokens": 16683166.0, "reward": 1.7999999523162842, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 511.625, "completions/mean_terminated_length": 511.625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.3931009038922708, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.06302335031796247, "learning_rate": 1.5215922081023748e-05, "loss": 0.0025, "num_tokens": 16694675.0, "reward": 1.9924242496490479, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9924242496490479, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.3932853717026379, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.06999094621278346, "learning_rate": 1.5210426143410554e-05, "loss": 0.0028, "num_tokens": 16703533.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.39346983951300496, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.08582563837990165, "learning_rate": 1.5204928044625106e-05, "loss": 0.0034, "num_tokens": 16710990.0, "reward": 1.8928570747375488, "reward_std": 0.30304577946662903, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.30304577946662903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.3936543073233721, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.06586863519623876, "learning_rate": 1.51994277869479e-05, "loss": 0.0026, "num_tokens": 16716311.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.39383877513373916, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "kl": 0.07721380144357681, "learning_rate": 1.5193925372660328e-05, "loss": 0.0031, "num_tokens": 16720530.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 355.625, "completions/mean_terminated_length": 355.625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.39402324294410623, "frac_reward_zero_std": 1.0, "grad_norm": 0.34375, "kl": 0.09563943604007363, "learning_rate": 1.5188420804044663e-05, "loss": 0.0038, "num_tokens": 16727327.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.39420771075447336, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.07430367637425661, "learning_rate": 1.518291408338409e-05, "loss": 0.003, "num_tokens": 16736479.0, "reward": 1.2819766998291016, "reward_std": 0.6965869069099426, "rewards/fixed_code_pass_all_test_reward/mean": 0.40697675943374634, "rewards/fixed_code_pass_all_test_reward/std": 0.4939172565937042, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 263.875, "completions/mean_terminated_length": 263.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.39439217856484043, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.07479282445274293, "learning_rate": 1.5177405212962672e-05, "loss": 0.003, "num_tokens": 16742726.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 633.375, "completions/mean_terminated_length": 633.375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.3945766463752075, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.027018562774173915, "learning_rate": 1.5171894195065374e-05, "loss": 0.0011, "num_tokens": 16759001.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 207.5, "completions/mean_terminated_length": 207.5, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.39476111418557464, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.08913787826895714, "learning_rate": 1.5166381031978044e-05, "loss": 0.0036, "num_tokens": 16767789.0, "reward": 1.4806547164916992, "reward_std": 0.2231660783290863, "rewards/fixed_code_pass_all_test_reward/mean": 0.480654776096344, "rewards/fixed_code_pass_all_test_reward/std": 0.2231660932302475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 395.25, "completions/mean_terminated_length": 395.25, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.3949455819959417, "frac_reward_zero_std": 1.0, "grad_norm": 0.61328125, "kl": 0.06631882023066282, "learning_rate": 1.5160865725987424e-05, "loss": 0.0027, "num_tokens": 16776255.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.3951300498063088, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.06657646922394633, "learning_rate": 1.5155348279381143e-05, "loss": 0.0027, "num_tokens": 16780717.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 379.875, "completions/mean_terminated_length": 379.875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3953145176166759, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.05815658322535455, "learning_rate": 1.514982869444772e-05, "loss": 0.0023, "num_tokens": 16789836.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 682.0, "completions/mean_terminated_length": 682.0, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 0.395498985427043, "frac_reward_zero_std": 0.0, "grad_norm": 0.69921875, "kl": 0.02320441952906549, "learning_rate": 1.5144306973476562e-05, "loss": 0.0009, "num_tokens": 16802700.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 162.125, "completions/mean_terminated_length": 162.125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.39568345323741005, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "kl": 0.08900434547103941, "learning_rate": 1.5138783118757951e-05, "loss": 0.0036, "num_tokens": 16811557.0, "reward": 1.048076868057251, "reward_std": 0.4311174154281616, "rewards/fixed_code_pass_all_test_reward/mean": 0.17307692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.10682539641857147, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.3958679210477772, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.08592150639742613, "learning_rate": 1.5133257132583074e-05, "loss": 0.0034, "num_tokens": 16820332.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 361.75, "completions/mean_terminated_length": 361.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.39605238885814426, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.06575191067531705, "learning_rate": 1.5127729017243983e-05, "loss": 0.0026, "num_tokens": 16831058.0, "reward": 1.9117646217346191, "reward_std": 0.22673510015010834, "rewards/fixed_code_pass_all_test_reward/mean": 0.9117646813392639, "rewards/fixed_code_pass_all_test_reward/std": 0.22673507034778595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.39623685666851133, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.06063879746943712, "learning_rate": 1.5122198775033626e-05, "loss": 0.0024, "num_tokens": 16835078.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 311.5, "completions/mean_terminated_length": 311.5, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.39642132447887846, "frac_reward_zero_std": 1.0, "grad_norm": 0.033203125, "kl": 0.01461614586878568, "learning_rate": 1.5116666408245819e-05, "loss": 0.0006, "num_tokens": 16841274.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 283.875, "completions/mean_terminated_length": 283.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.39660579228924553, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.05741693219169974, "learning_rate": 1.5111131919175277e-05, "loss": 0.0023, "num_tokens": 16850505.0, "reward": 1.55978262424469, "reward_std": 0.6075577139854431, "rewards/fixed_code_pass_all_test_reward/mean": 0.6847826242446899, "rewards/fixed_code_pass_all_test_reward/std": 0.34175604581832886, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 377.875, "completions/mean_terminated_length": 377.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.3967902600996126, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.054730904987081885, "learning_rate": 1.5105595310117584e-05, "loss": 0.0022, "num_tokens": 16860760.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 429.0, "completions/mean_terminated_length": 429.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.39697472790997973, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.05390038341283798, "learning_rate": 1.5100056583369207e-05, "loss": 0.0022, "num_tokens": 16869328.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 503.375, "completions/mean_terminated_length": 503.375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.3971591957203468, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.06171317817643285, "learning_rate": 1.5094515741227489e-05, "loss": 0.0025, "num_tokens": 16883371.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 111.625, "completions/mean_terminated_length": 111.625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.3973436635307139, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.1157530490309, "learning_rate": 1.508897278599065e-05, "loss": 0.0046, "num_tokens": 16886928.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 345.375, "completions/mean_terminated_length": 345.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.397528131341081, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.061378602869808674, "learning_rate": 1.5083427719957792e-05, "loss": 0.0025, "num_tokens": 16899411.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 393.625, "completions/mean_terminated_length": 393.625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.3977125991514481, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.08036471251398325, "learning_rate": 1.5077880545428886e-05, "loss": 0.0032, "num_tokens": 16910128.0, "reward": 1.702830195426941, "reward_std": 0.43434298038482666, "rewards/fixed_code_pass_all_test_reward/mean": 0.8278301954269409, "rewards/fixed_code_pass_all_test_reward/std": 0.3359243869781494, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.39789706696181515, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.055068833753466606, "learning_rate": 1.507233126470478e-05, "loss": 0.0022, "num_tokens": 16915493.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.3980815347721823, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.06810943176969886, "learning_rate": 1.5066779880087197e-05, "loss": 0.0027, "num_tokens": 16919987.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.39826600258254935, "frac_reward_zero_std": 1.0, "grad_norm": 1.2265625, "kl": 0.11746348394080997, "learning_rate": 1.5061226393878732e-05, "loss": 0.0047, "num_tokens": 16924885.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 280.5, "completions/mean_terminated_length": 280.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.3984504703929164, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.07588697969913483, "learning_rate": 1.5055670808382848e-05, "loss": 0.003, "num_tokens": 16936049.0, "reward": 1.2378641366958618, "reward_std": 0.04494272544980049, "rewards/fixed_code_pass_all_test_reward/mean": 0.23786407709121704, "rewards/fixed_code_pass_all_test_reward/std": 0.04494272544980049, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 214.625, "completions/mean_terminated_length": 214.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.39863493820328355, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.06469164998270571, "learning_rate": 1.5050113125903888e-05, "loss": 0.0026, "num_tokens": 16941798.0, "reward": 1.628676414489746, "reward_std": 0.4687298834323883, "rewards/fixed_code_pass_all_test_reward/mean": 0.6286764740943909, "rewards/fixed_code_pass_all_test_reward/std": 0.4687299132347107, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 422.625, "completions/mean_terminated_length": 422.625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.3988194060136506, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.03604648669715971, "learning_rate": 1.504455334874705e-05, "loss": 0.0014, "num_tokens": 16950131.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 382.125, "completions/mean_terminated_length": 382.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3990038738240177, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.05727647803723812, "learning_rate": 1.5038991479218417e-05, "loss": 0.0023, "num_tokens": 16964060.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.3991883416343848, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.09642468672245741, "learning_rate": 1.503342751962493e-05, "loss": 0.0039, "num_tokens": 16969231.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.3993728094447519, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.0387020034249872, "learning_rate": 1.5027861472274396e-05, "loss": 0.0015, "num_tokens": 16973699.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 180.375, "completions/mean_terminated_length": 180.375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.39955727725511897, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.0747916353866458, "learning_rate": 1.50222933394755e-05, "loss": 0.003, "num_tokens": 16978054.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.3997417450654861, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.07223208481445909, "learning_rate": 1.5016723123537773e-05, "loss": 0.0029, "num_tokens": 16985427.0, "reward": 1.9285714626312256, "reward_std": 0.141718789935112, "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.1417188197374344, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 198.5, "completions/mean_terminated_length": 198.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.39992621287585317, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.07610294921323657, "learning_rate": 1.5011150826771626e-05, "loss": 0.003, "num_tokens": 16989999.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.40011068068622024, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.05206793872639537, "learning_rate": 1.5005576451488324e-05, "loss": 0.0021, "num_tokens": 16993811.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.40029514849658737, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.08494254294782877, "learning_rate": 1.5000000000000002e-05, "loss": 0.0034, "num_tokens": 17001948.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.40047961630695444, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0696095353923738, "learning_rate": 1.4994421474619648e-05, "loss": 0.0028, "num_tokens": 17011451.0, "reward": 1.0750000476837158, "reward_std": 0.4773438274860382, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.21380901336669922, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 200.625, "completions/mean_terminated_length": 200.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.4006640841173215, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.09843532973900437, "learning_rate": 1.4988840877661117e-05, "loss": 0.0039, "num_tokens": 17019048.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.40084855192768865, "frac_reward_zero_std": 1.0, "grad_norm": 0.1748046875, "kl": 0.0645972965285182, "learning_rate": 1.4983258211439118e-05, "loss": 0.0026, "num_tokens": 17025990.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4010330197380557, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.10106193367391825, "learning_rate": 1.4977673478269217e-05, "loss": 0.004, "num_tokens": 17035580.0, "reward": 1.0165441036224365, "reward_std": 0.6307845115661621, "rewards/fixed_code_pass_all_test_reward/mean": 0.2665441036224365, "rewards/fixed_code_pass_all_test_reward/std": 0.1768968105316162, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 203.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.4012174875484228, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.07301073893904686, "learning_rate": 1.4972086680467847e-05, "loss": 0.0029, "num_tokens": 17043467.0, "reward": 1.5437500476837158, "reward_std": 0.12082301080226898, "rewards/fixed_code_pass_all_test_reward/mean": 0.543749988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.12082307040691376, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 341.125, "completions/mean_terminated_length": 341.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.40140195535878986, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.07347873412072659, "learning_rate": 1.4966497820352289e-05, "loss": 0.0029, "num_tokens": 17054412.0, "reward": 1.9874999523162842, "reward_std": 0.035355329513549805, "rewards/fixed_code_pass_all_test_reward/mean": 0.987500011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553481400013, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.401586423169157, "frac_reward_zero_std": 1.0, "grad_norm": 0.21875, "kl": 0.07371348980814219, "learning_rate": 1.496090690024068e-05, "loss": 0.0029, "num_tokens": 17057904.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 276.75, "completions/mean_terminated_length": 276.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.40177089097952406, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.04874282097443938, "learning_rate": 1.4955313922452015e-05, "loss": 0.0019, "num_tokens": 17066710.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 322.875, "completions/mean_terminated_length": 322.875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.40195535878989114, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.0752912680618465, "learning_rate": 1.4949718889306137e-05, "loss": 0.003, "num_tokens": 17075485.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 285.75, "completions/mean_terminated_length": 285.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.40213982660025827, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0750751057639718, "learning_rate": 1.4944121803123751e-05, "loss": 0.003, "num_tokens": 17084603.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 629.75, "completions/mean_terminated_length": 629.75, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.40232429441062534, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.04606401827186346, "learning_rate": 1.49385226662264e-05, "loss": 0.0018, "num_tokens": 17103049.0, "reward": 1.25, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.4025087622209924, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.07699226797558367, "learning_rate": 1.4932921480936491e-05, "loss": 0.0031, "num_tokens": 17107284.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 417.25, "completions/mean_terminated_length": 417.25, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.40269323003135954, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.06068625417537987, "learning_rate": 1.492731824957727e-05, "loss": 0.0024, "num_tokens": 17118542.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 253.0, "completions/mean_terminated_length": 253.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.4028776978417266, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.07746365154162049, "learning_rate": 1.4921712974472841e-05, "loss": 0.0031, "num_tokens": 17126358.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.4030621656520937, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.0674064108170569, "learning_rate": 1.4916105657948147e-05, "loss": 0.0027, "num_tokens": 17135223.0, "reward": 1.3928570747375488, "reward_std": 0.3042459785938263, "rewards/fixed_code_pass_all_test_reward/mean": 0.3928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.3042459487915039, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 235.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.4032466334624608, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.05836702208034694, "learning_rate": 1.4910496302328988e-05, "loss": 0.0023, "num_tokens": 17141512.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 285.875, "completions/mean_terminated_length": 285.875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.4034311012728279, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.033646262250840664, "learning_rate": 1.4904884909941994e-05, "loss": 0.0013, "num_tokens": 17148735.0, "reward": 1.9722222089767456, "reward_std": 0.07856743782758713, "rewards/fixed_code_pass_all_test_reward/mean": 0.9722222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.07856741547584534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 300.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.40361556908319496, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.06701509840786457, "learning_rate": 1.489927148311466e-05, "loss": 0.0027, "num_tokens": 17158043.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 211.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.4038000368935621, "frac_reward_zero_std": 1.0, "grad_norm": 0.380859375, "kl": 0.05827523279003799, "learning_rate": 1.4893656024175307e-05, "loss": 0.0023, "num_tokens": 17163901.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.40398450470392916, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.12816407717764378, "learning_rate": 1.4888038535453107e-05, "loss": 0.0051, "num_tokens": 17171214.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.40416897251429623, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.08881997084245086, "learning_rate": 1.4882419019278075e-05, "loss": 0.0036, "num_tokens": 17181143.0, "reward": 1.7114661931991577, "reward_std": 0.4518243372440338, "rewards/fixed_code_pass_all_test_reward/mean": 0.7114661931991577, "rewards/fixed_code_pass_all_test_reward/std": 0.4518243670463562, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.40435344032466336, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.04632115759886801, "learning_rate": 1.4876797477981061e-05, "loss": 0.0019, "num_tokens": 17187852.0, "reward": 1.625, "reward_std": 0.1515229046344757, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.15152287483215332, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.40453790813503043, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.06617632892448455, "learning_rate": 1.4871173913893766e-05, "loss": 0.0026, "num_tokens": 17197172.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.4047223759453975, "frac_reward_zero_std": 1.0, "grad_norm": 0.365234375, "kl": 0.10388125013560057, "learning_rate": 1.4865548329348717e-05, "loss": 0.0042, "num_tokens": 17206305.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.40490684375576463, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "kl": 0.06814601551741362, "learning_rate": 1.4859920726679287e-05, "loss": 0.0027, "num_tokens": 17210521.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4050913115661317, "frac_reward_zero_std": 1.0, "grad_norm": 0.1845703125, "kl": 0.10374442534521222, "learning_rate": 1.4854291108219685e-05, "loss": 0.0041, "num_tokens": 17218457.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.4052757793764988, "frac_reward_zero_std": 1.0, "grad_norm": 0.248046875, "kl": 0.08447891473770142, "learning_rate": 1.4848659476304951e-05, "loss": 0.0034, "num_tokens": 17222388.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 286.75, "completions/mean_terminated_length": 286.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.4054602471868659, "frac_reward_zero_std": 1.0, "grad_norm": 0.052978515625, "kl": 0.025556190870702267, "learning_rate": 1.4843025833270973e-05, "loss": 0.001, "num_tokens": 17228386.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.405644714997233, "frac_reward_zero_std": 1.0, "grad_norm": 1.6796875, "kl": 0.11554331751540303, "learning_rate": 1.4837390181454455e-05, "loss": 0.0046, "num_tokens": 17232250.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.40582918280760005, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.09681805921718478, "learning_rate": 1.4831752523192949e-05, "loss": 0.0039, "num_tokens": 17240594.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.4060136506179672, "frac_reward_zero_std": 0.0, "grad_norm": 3.421875, "kl": 0.16851473227143288, "learning_rate": 1.482611286082483e-05, "loss": 0.0067, "num_tokens": 17248708.0, "reward": 1.1050000190734863, "reward_std": 0.07690441608428955, "rewards/fixed_code_pass_all_test_reward/mean": 0.10499999672174454, "rewards/fixed_code_pass_all_test_reward/std": 0.07690439373254776, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 280.25, "completions/mean_terminated_length": 280.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.40619811842833425, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.07621269626542926, "learning_rate": 1.4820471196689314e-05, "loss": 0.003, "num_tokens": 17258830.0, "reward": 1.9679054021835327, "reward_std": 0.09077723324298859, "rewards/fixed_code_pass_all_test_reward/mean": 0.9679054021835327, "rewards/fixed_code_pass_all_test_reward/std": 0.09077723324298859, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4063825862387013, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.07183301774784923, "learning_rate": 1.4814827533126436e-05, "loss": 0.0029, "num_tokens": 17268731.0, "reward": 1.90625, "reward_std": 0.0776323452591896, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.0776323676109314, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.40656705404906845, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.07773705758154392, "learning_rate": 1.480918187247707e-05, "loss": 0.0031, "num_tokens": 17279286.0, "reward": 0.9510869383811951, "reward_std": 0.3842971622943878, "rewards/fixed_code_pass_all_test_reward/mean": 0.07608695328235626, "rewards/fixed_code_pass_all_test_reward/std": 0.030743775889277458, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 162.875, "completions/mean_terminated_length": 162.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.4067515218594355, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.05165184196084738, "learning_rate": 1.480353421708291e-05, "loss": 0.0021, "num_tokens": 17287069.0, "reward": 1.3518518209457397, "reward_std": 0.22178086638450623, "rewards/fixed_code_pass_all_test_reward/mean": 0.35185185074806213, "rewards/fixed_code_pass_all_test_reward/std": 0.22178086638450623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 313.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.4069359896698026, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.061443733517080545, "learning_rate": 1.4797884569286484e-05, "loss": 0.0025, "num_tokens": 17298249.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 389.5, "completions/mean_terminated_length": 389.5, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.40712045748016973, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.05091904290020466, "learning_rate": 1.4792232931431147e-05, "loss": 0.002, "num_tokens": 17307053.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.4073049252905368, "frac_reward_zero_std": 1.0, "grad_norm": 0.046142578125, "kl": 0.025621530483476818, "learning_rate": 1.4786579305861069e-05, "loss": 0.001, "num_tokens": 17317268.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 258.125, "completions/mean_terminated_length": 258.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.4074893931009039, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.04336713463999331, "learning_rate": 1.4780923694921257e-05, "loss": 0.0017, "num_tokens": 17328901.0, "reward": 1.9008264541625977, "reward_std": 0.061211276799440384, "rewards/fixed_code_pass_all_test_reward/mean": 0.9008264541625977, "rewards/fixed_code_pass_all_test_reward/std": 0.061211250722408295, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 157.75, "completions/mean_terminated_length": 157.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.407673860911271, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.14041194133460522, "learning_rate": 1.4775266100957533e-05, "loss": 0.0056, "num_tokens": 17337803.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.4078583287216381, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.06668876390904188, "learning_rate": 1.4769606526316549e-05, "loss": 0.0027, "num_tokens": 17342231.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.40804279653200515, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.07254371343879029, "learning_rate": 1.4763944973345771e-05, "loss": 0.0029, "num_tokens": 17350603.0, "reward": 1.9473683834075928, "reward_std": 0.056265562772750854, "rewards/fixed_code_pass_all_test_reward/mean": 0.9473683834075928, "rewards/fixed_code_pass_all_test_reward/std": 0.05626553297042847, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.4082272643423723, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.04762493132147938, "learning_rate": 1.475828144439349e-05, "loss": 0.0019, "num_tokens": 17358027.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.40841173215273935, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.04269046615809202, "learning_rate": 1.4752615941808817e-05, "loss": 0.0017, "num_tokens": 17366647.0, "reward": 1.9291044473648071, "reward_std": 0.20052284002304077, "rewards/fixed_code_pass_all_test_reward/mean": 0.9291044473648071, "rewards/fixed_code_pass_all_test_reward/std": 0.20052282512187958, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 227.375, "completions/mean_terminated_length": 227.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.4085961999631064, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.06572181452065706, "learning_rate": 1.4746948467941675e-05, "loss": 0.0026, "num_tokens": 17372866.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.40878066777347355, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.04801651241723448, "learning_rate": 1.4741279025142816e-05, "loss": 0.0019, "num_tokens": 17384401.0, "reward": 1.5340908765792847, "reward_std": 0.2680882513523102, "rewards/fixed_code_pass_all_test_reward/mean": 0.5340908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.2680882513523102, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 175.625, "completions/mean_terminated_length": 175.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4089651355838406, "frac_reward_zero_std": 0.0, "grad_norm": 3.65625, "kl": 0.08129481226205826, "learning_rate": 1.4735607615763799e-05, "loss": 0.0033, "num_tokens": 17389582.0, "reward": 1.1136364936828613, "reward_std": 0.09409989416599274, "rewards/fixed_code_pass_all_test_reward/mean": 0.11363636702299118, "rewards/fixed_code_pass_all_test_reward/std": 0.09409985691308975, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 199.875, "completions/mean_terminated_length": 199.875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.4091496033942077, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.11348438821732998, "learning_rate": 1.4729934242157005e-05, "loss": 0.0045, "num_tokens": 17398093.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 211.375, "completions/mean_terminated_length": 211.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.4093340712045748, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.04081889207009226, "learning_rate": 1.4724258906675621e-05, "loss": 0.0016, "num_tokens": 17403960.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.4095185390149419, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.05263721733354032, "learning_rate": 1.4718581611673662e-05, "loss": 0.0021, "num_tokens": 17413054.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 361.875, "completions/mean_terminated_length": 361.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.40970300682530897, "frac_reward_zero_std": 0.0, "grad_norm": 16.75, "kl": 0.41489675221964717, "learning_rate": 1.4712902359505941e-05, "loss": 0.0166, "num_tokens": 17423717.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 182.0, "completions/mean_terminated_length": 182.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4098874746356761, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.06852096738293767, "learning_rate": 1.470722115252809e-05, "loss": 0.0027, "num_tokens": 17427885.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 242.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.41007194244604317, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.08015329111367464, "learning_rate": 1.4701537993096553e-05, "loss": 0.0032, "num_tokens": 17436704.0, "reward": 1.7083332538604736, "reward_std": 0.31180480122566223, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.31180480122566223, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 181.375, "completions/mean_terminated_length": 181.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.41025641025641024, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.045000502839684486, "learning_rate": 1.469585288356858e-05, "loss": 0.0018, "num_tokens": 17440971.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.41044087806677737, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.05684004141949117, "learning_rate": 1.4690165826302234e-05, "loss": 0.0023, "num_tokens": 17445902.0, "reward": 1.75, "reward_std": 0.3450327515602112, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.41062534587714444, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.07647980190813541, "learning_rate": 1.4684476823656382e-05, "loss": 0.0031, "num_tokens": 17450532.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 351.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.4108098136875115, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.07840629876591265, "learning_rate": 1.4678785877990699e-05, "loss": 0.0031, "num_tokens": 17463053.0, "reward": 1.734375, "reward_std": 0.19408094882965088, "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, "rewards/fixed_code_pass_all_test_reward/std": 0.19408094882965088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.41099428149787864, "frac_reward_zero_std": 0.0, "grad_norm": 11.0, "kl": 0.4483463508076966, "learning_rate": 1.4673092991665664e-05, "loss": 0.0179, "num_tokens": 17470579.0, "reward": 1.15625, "reward_std": 0.1293872892856598, "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.4111787493082457, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.11281880270689726, "learning_rate": 1.466739816704257e-05, "loss": 0.0045, "num_tokens": 17475797.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 209.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.4113632171186128, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.09484882932156324, "learning_rate": 1.4661701406483502e-05, "loss": 0.0038, "num_tokens": 17481679.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.4115476849289799, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.07578158937394619, "learning_rate": 1.4656002712351353e-05, "loss": 0.003, "num_tokens": 17486601.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.411732152739347, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.05298419250175357, "learning_rate": 1.4650302087009825e-05, "loss": 0.0021, "num_tokens": 17491402.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 353.125, "completions/mean_terminated_length": 353.125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.41191662054971406, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.06112291198223829, "learning_rate": 1.4644599532823403e-05, "loss": 0.0024, "num_tokens": 17504899.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.4121010883600812, "frac_reward_zero_std": 0.0, "grad_norm": 3.421875, "kl": 0.06611500401049852, "learning_rate": 1.4638895052157391e-05, "loss": 0.0026, "num_tokens": 17509656.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.41228555617044826, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.08356986427679658, "learning_rate": 1.4633188647377883e-05, "loss": 0.0033, "num_tokens": 17516734.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 178.0, "completions/mean_terminated_length": 178.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.41247002398081534, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.06750589539296925, "learning_rate": 1.4627480320851775e-05, "loss": 0.0027, "num_tokens": 17521158.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 373.375, "completions/mean_terminated_length": 373.375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.41265449179118247, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.03591419302392751, "learning_rate": 1.4621770074946755e-05, "loss": 0.0014, "num_tokens": 17529921.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.41283895960154954, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0746129360049963, "learning_rate": 1.4616057912031308e-05, "loss": 0.003, "num_tokens": 17540066.0, "reward": 1.4488636255264282, "reward_std": 0.3696240186691284, "rewards/fixed_code_pass_all_test_reward/mean": 0.4488636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.3696240186691284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 204.125, "completions/mean_terminated_length": 204.125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.4130234274119166, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.08220534306019545, "learning_rate": 1.4610343834474719e-05, "loss": 0.0033, "num_tokens": 17548843.0, "reward": 1.1299341917037964, "reward_std": 0.4657563865184784, "rewards/fixed_code_pass_all_test_reward/mean": 0.2549342215061188, "rewards/fixed_code_pass_all_test_reward/std": 0.1381690800189972, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 241.0, "completions/mean_terminated_length": 241.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.41320789522228374, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.08537415787577629, "learning_rate": 1.4604627844647062e-05, "loss": 0.0034, "num_tokens": 17556883.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.4133923630326508, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.05751473782584071, "learning_rate": 1.4598909944919207e-05, "loss": 0.0023, "num_tokens": 17561265.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4135768308430179, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.06698378419969231, "learning_rate": 1.4593190137662812e-05, "loss": 0.0027, "num_tokens": 17569395.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.41376129865338496, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.06035445840097964, "learning_rate": 1.4587468425250334e-05, "loss": 0.0024, "num_tokens": 17575218.0, "reward": 1.754166603088379, "reward_std": 0.455282062292099, "rewards/fixed_code_pass_all_test_reward/mean": 0.7541666626930237, "rewards/fixed_code_pass_all_test_reward/std": 0.4552820324897766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 225.25, "completions/mean_terminated_length": 225.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.4139457664637521, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.08246725844219327, "learning_rate": 1.4581744810055016e-05, "loss": 0.0033, "num_tokens": 17582956.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 363.125, "completions/mean_terminated_length": 363.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.41413023427411916, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.05144261009991169, "learning_rate": 1.457601929445089e-05, "loss": 0.0021, "num_tokens": 17590461.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 196.125, "completions/mean_terminated_length": 196.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.41431470208448623, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.10308314580470324, "learning_rate": 1.4570291880812772e-05, "loss": 0.0041, "num_tokens": 17601078.0, "reward": 1.4153225421905518, "reward_std": 0.16932740807533264, "rewards/fixed_code_pass_all_test_reward/mean": 0.41532254219055176, "rewards/fixed_code_pass_all_test_reward/std": 0.16932740807533264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 388.25, "completions/mean_terminated_length": 388.25, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.41449916989485336, "frac_reward_zero_std": 1.0, "grad_norm": 0.1708984375, "kl": 0.04320777929387987, "learning_rate": 1.4564562571516273e-05, "loss": 0.0017, "num_tokens": 17614568.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.41468363770522043, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.08569754054769874, "learning_rate": 1.4558831368937785e-05, "loss": 0.0034, "num_tokens": 17623082.0, "reward": 1.1875, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.4148681055155875, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.04912325507029891, "learning_rate": 1.4553098275454487e-05, "loss": 0.002, "num_tokens": 17629801.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 317.25, "completions/mean_terminated_length": 317.25, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.41505257332595463, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.09153992775827646, "learning_rate": 1.4547363293444345e-05, "loss": 0.0037, "num_tokens": 17636283.0, "reward": 1.65625, "reward_std": 0.376485139131546, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.4152370411363217, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.12008462054654956, "learning_rate": 1.4541626425286101e-05, "loss": 0.0048, "num_tokens": 17646382.0, "reward": 1.75, "reward_std": 0.3842463493347168, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.3842463791370392, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.4154215089466888, "frac_reward_zero_std": 1.0, "grad_norm": 0.197265625, "kl": 0.10518454108387232, "learning_rate": 1.453588767335929e-05, "loss": 0.0042, "num_tokens": 17652930.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.4156059767570559, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.04976614727638662, "learning_rate": 1.453014704004422e-05, "loss": 0.002, "num_tokens": 17659239.0, "reward": 1.2083332538604736, "reward_std": 0.06299406290054321, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0629940778017044, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 296.25, "completions/mean_terminated_length": 296.25, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.415790444567423, "frac_reward_zero_std": 1.0, "grad_norm": 0.1708984375, "kl": 0.07808349002152681, "learning_rate": 1.4524404527721977e-05, "loss": 0.0031, "num_tokens": 17668041.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.41597491237779005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.062146781012415886, "learning_rate": 1.451866013877444e-05, "loss": 0.0025, "num_tokens": 17677338.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.4161593801881572, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.08158776955679059, "learning_rate": 1.451291387558425e-05, "loss": 0.0033, "num_tokens": 17682866.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.41634384799852425, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.037329346407204866, "learning_rate": 1.4507165740534836e-05, "loss": 0.0015, "num_tokens": 17690845.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.4165283158088913, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.06640745140612125, "learning_rate": 1.45014157360104e-05, "loss": 0.0027, "num_tokens": 17698315.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.41671278361925845, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.06016148393973708, "learning_rate": 1.4495663864395921e-05, "loss": 0.0024, "num_tokens": 17704226.0, "reward": 1.579545497894287, "reward_std": 0.13690371811389923, "rewards/fixed_code_pass_all_test_reward/mean": 0.5795454382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.13690368831157684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 151.625, "completions/mean_terminated_length": 151.625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.4168972514296255, "frac_reward_zero_std": 0.0, "grad_norm": 3.625, "kl": 0.1150235217064619, "learning_rate": 1.4489910128077147e-05, "loss": 0.0046, "num_tokens": 17708319.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4170817192399926, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.07665531383827329, "learning_rate": 1.4484154529440609e-05, "loss": 0.0031, "num_tokens": 17712438.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.4172661870503597, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.05605759215541184, "learning_rate": 1.4478397070873602e-05, "loss": 0.0022, "num_tokens": 17719736.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.4174506548607268, "frac_reward_zero_std": 0.0, "grad_norm": 4.4375, "kl": 0.12827272340655327, "learning_rate": 1.4472637754764195e-05, "loss": 0.0051, "num_tokens": 17741728.0, "reward": 1.8343374729156494, "reward_std": 0.13718166947364807, "rewards/fixed_code_pass_all_test_reward/mean": 0.8343373537063599, "rewards/fixed_code_pass_all_test_reward/std": 0.13718171417713165, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.4176351226710939, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.0989237753674388, "learning_rate": 1.446687658350123e-05, "loss": 0.004, "num_tokens": 17748213.0, "reward": 1.394736886024475, "reward_std": 0.6669304370880127, "rewards/fixed_code_pass_all_test_reward/mean": 0.5197368264198303, "rewards/fixed_code_pass_all_test_reward/std": 0.4138840138912201, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 147.25, "completions/mean_terminated_length": 147.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.417819590481461, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.14350945875048637, "learning_rate": 1.4461113559474313e-05, "loss": 0.0057, "num_tokens": 17755511.0, "reward": 1.8775510787963867, "reward_std": 0.02439240738749504, "rewards/fixed_code_pass_all_test_reward/mean": 0.8775510191917419, "rewards/fixed_code_pass_all_test_reward/std": 0.02439240925014019, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.4180040582918281, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.05686687584966421, "learning_rate": 1.4455348685073827e-05, "loss": 0.0023, "num_tokens": 17764099.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.41818852610219515, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.06077292747795582, "learning_rate": 1.4449581962690911e-05, "loss": 0.0024, "num_tokens": 17769387.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 129.25, "completions/mean_terminated_length": 129.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.4183729939125623, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.08696111221797764, "learning_rate": 1.4443813394717485e-05, "loss": 0.0035, "num_tokens": 17776037.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 280.375, "completions/mean_terminated_length": 280.375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.41855746172292935, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.02779914962593466, "learning_rate": 1.4438042983546216e-05, "loss": 0.0011, "num_tokens": 17783328.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 152.625, "completions/mean_terminated_length": 152.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.4187419295332964, "frac_reward_zero_std": 1.0, "grad_norm": 0.1220703125, "kl": 0.07328482205048203, "learning_rate": 1.443227073157056e-05, "loss": 0.0029, "num_tokens": 17787277.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.41892639734366355, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.07957849558442831, "learning_rate": 1.4426496641184706e-05, "loss": 0.0032, "num_tokens": 17796558.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 250.125, "completions/mean_terminated_length": 250.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.4191108651540306, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.041133893420919776, "learning_rate": 1.4420720714783635e-05, "loss": 0.0016, "num_tokens": 17801519.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 187.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.4192953329643977, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.09733851626515388, "learning_rate": 1.4414942954763071e-05, "loss": 0.0039, "num_tokens": 17809733.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.4194798007747648, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.06036228500306606, "learning_rate": 1.4409163363519503e-05, "loss": 0.0024, "num_tokens": 17816940.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 136.875, "completions/mean_terminated_length": 136.875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.4196642685851319, "frac_reward_zero_std": 1.0, "grad_norm": 0.3515625, "kl": 0.09297371748834848, "learning_rate": 1.4403381943450186e-05, "loss": 0.0037, "num_tokens": 17823499.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.41984873639549897, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796875, "kl": 0.0828865715302527, "learning_rate": 1.4397598696953124e-05, "loss": 0.0033, "num_tokens": 17831911.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 353.125, "completions/mean_terminated_length": 353.125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.4200332042058661, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.05399620532989502, "learning_rate": 1.4391813626427083e-05, "loss": 0.0022, "num_tokens": 17839544.0, "reward": 1.5681817531585693, "reward_std": 0.19284728169441223, "rewards/fixed_code_pass_all_test_reward/mean": 0.5681818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.19284729659557343, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 225.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.42021767201623317, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.054824480786919594, "learning_rate": 1.4386026734271583e-05, "loss": 0.0022, "num_tokens": 17849565.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.42040213982660024, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.09965190105140209, "learning_rate": 1.438023802288691e-05, "loss": 0.004, "num_tokens": 17857778.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 484.75, "completions/mean_terminated_length": 484.75, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.42058660763696737, "frac_reward_zero_std": 0.0, "grad_norm": 0.64453125, "kl": 0.02528570790309459, "learning_rate": 1.4374447494674088e-05, "loss": 0.001, "num_tokens": 17870224.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.42077107544733444, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.0731535186059773, "learning_rate": 1.4368655152034906e-05, "loss": 0.0029, "num_tokens": 17876961.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 133.5, "completions/mean_terminated_length": 133.5, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.4209555432577015, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.10476860590279102, "learning_rate": 1.4362860997371908e-05, "loss": 0.0042, "num_tokens": 17882373.0, "reward": 1.809999942779541, "reward_std": 0.35181164741516113, "rewards/fixed_code_pass_all_test_reward/mean": 0.8100000023841858, "rewards/fixed_code_pass_all_test_reward/std": 0.35181164741516113, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.42114001106806864, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.09875359293073416, "learning_rate": 1.4357065033088376e-05, "loss": 0.004, "num_tokens": 17891647.0, "reward": 1.5978261232376099, "reward_std": 0.1781320869922638, "rewards/fixed_code_pass_all_test_reward/mean": 0.5978260636329651, "rewards/fixed_code_pass_all_test_reward/std": 0.1781320571899414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 230.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.4213244788784357, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.09526335587725043, "learning_rate": 1.4351267261588352e-05, "loss": 0.0038, "num_tokens": 17897246.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 307.0, "completions/mean_terminated_length": 307.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4215089466888028, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.07422452419996262, "learning_rate": 1.434546768527663e-05, "loss": 0.003, "num_tokens": 17904486.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.4216934144991699, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.12209584098309278, "learning_rate": 1.433966630655875e-05, "loss": 0.0049, "num_tokens": 17908474.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 244.0, "completions/mean_terminated_length": 244.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.421877882309537, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.062362268567085266, "learning_rate": 1.4333863127840993e-05, "loss": 0.0025, "num_tokens": 17916458.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.42206235011990406, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.0645166770555079, "learning_rate": 1.4328058151530397e-05, "loss": 0.0026, "num_tokens": 17923586.0, "reward": 1.7533783912658691, "reward_std": 0.45671162009239197, "rewards/fixed_code_pass_all_test_reward/mean": 0.7533783912658691, "rewards/fixed_code_pass_all_test_reward/std": 0.45671164989471436, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 361.5, "completions/mean_terminated_length": 361.5, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.4222468179302712, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.04229564475826919, "learning_rate": 1.432225138003474e-05, "loss": 0.0017, "num_tokens": 17932070.0, "reward": 1.7083332538604736, "reward_std": 0.26888954639434814, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.26888954639434814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.42243128574063826, "frac_reward_zero_std": 0.0, "grad_norm": 3.6875, "kl": 0.11315606161952019, "learning_rate": 1.4316442815762543e-05, "loss": 0.0045, "num_tokens": 17936403.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.42261575355100534, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.08062117593362927, "learning_rate": 1.4310632461123075e-05, "loss": 0.0032, "num_tokens": 17941939.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 141.125, "completions/mean_terminated_length": 141.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.42280022136137246, "frac_reward_zero_std": 1.0, "grad_norm": 0.33984375, "kl": 0.09701225208118558, "learning_rate": 1.4304820318526344e-05, "loss": 0.0039, "num_tokens": 17947740.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.42298468917173954, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.10700548999011517, "learning_rate": 1.4299006390383106e-05, "loss": 0.0043, "num_tokens": 17955166.0, "reward": 1.5724170207977295, "reward_std": 0.635352373123169, "rewards/fixed_code_pass_all_test_reward/mean": 0.6974169611930847, "rewards/fixed_code_pass_all_test_reward/std": 0.2817990183830261, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 213.0, "completions/mean_terminated_length": 213.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.4231691569821066, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.09630985790863633, "learning_rate": 1.4293190679104845e-05, "loss": 0.0039, "num_tokens": 17959774.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 472.25, "completions/mean_terminated_length": 472.25, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.42335362479247374, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.03610663185827434, "learning_rate": 1.4287373187103799e-05, "loss": 0.0014, "num_tokens": 17968552.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 360.5, "completions/mean_terminated_length": 360.5, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.4235380926028408, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.04209729307331145, "learning_rate": 1.4281553916792936e-05, "loss": 0.0017, "num_tokens": 17975508.0, "reward": 1.6875, "reward_std": 0.3603074252605438, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.24164614081382751, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.4237225604132079, "frac_reward_zero_std": 0.0, "grad_norm": 3.484375, "kl": 0.06692853290587664, "learning_rate": 1.4275732870585963e-05, "loss": 0.0027, "num_tokens": 17979370.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 253.875, "completions/mean_terminated_length": 253.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.423907028223575, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.12152359681203961, "learning_rate": 1.4269910050897327e-05, "loss": 0.0049, "num_tokens": 17988809.0, "reward": 0.9464285373687744, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 325.25, "completions/mean_terminated_length": 325.25, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.4240914960339421, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.048552344320341945, "learning_rate": 1.4264085460142202e-05, "loss": 0.0019, "num_tokens": 17994811.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 181.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.42427596384430916, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.06158851785585284, "learning_rate": 1.4258259100736508e-05, "loss": 0.0025, "num_tokens": 18000125.0, "reward": 1.875, "reward_std": 0.17478273808956146, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.17478272318840027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 392.625, "completions/mean_terminated_length": 392.625, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.4244604316546763, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.09850888093933463, "learning_rate": 1.4252430975096893e-05, "loss": 0.0039, "num_tokens": 18009106.0, "reward": 1.7213542461395264, "reward_std": 0.022097086533904076, "rewards/fixed_code_pass_all_test_reward/mean": 0.7213541865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.022097086533904076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.42464489946504336, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.06322441063821316, "learning_rate": 1.4246601085640734e-05, "loss": 0.0025, "num_tokens": 18014262.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 228.0, "completions/mean_terminated_length": 228.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.42482936727541043, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.09421087941154838, "learning_rate": 1.4240769434786146e-05, "loss": 0.0038, "num_tokens": 18023094.0, "reward": 1.1284722089767456, "reward_std": 0.07111094146966934, "rewards/fixed_code_pass_all_test_reward/mean": 0.1284722238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.07111096382141113, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.42501383508577756, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.07447052653878927, "learning_rate": 1.4234936024951971e-05, "loss": 0.003, "num_tokens": 18028886.0, "reward": 1.399999976158142, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.42519830289614463, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.09188799792900681, "learning_rate": 1.422910085855778e-05, "loss": 0.0037, "num_tokens": 18036480.0, "reward": 1.5458333492279053, "reward_std": 0.2130187600851059, "rewards/fixed_code_pass_all_test_reward/mean": 0.5458333492279053, "rewards/fixed_code_pass_all_test_reward/std": 0.2130187302827835, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 130.875, "completions/mean_terminated_length": 130.875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4253827707065117, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.0649915721733123, "learning_rate": 1.4223263938023876e-05, "loss": 0.0026, "num_tokens": 18040343.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 516.375, "completions/mean_terminated_length": 516.375, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.42556723851687883, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.061228420585393906, "learning_rate": 1.421742526577129e-05, "loss": 0.0024, "num_tokens": 18055562.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 122.25, "completions/mean_terminated_length": 122.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.4257517063272459, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.10539632756263018, "learning_rate": 1.4211584844221771e-05, "loss": 0.0042, "num_tokens": 18062716.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.425936174137613, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.09498989954590797, "learning_rate": 1.4205742675797805e-05, "loss": 0.0038, "num_tokens": 18073305.0, "reward": 1.5217390060424805, "reward_std": 0.36227741837501526, "rewards/fixed_code_pass_all_test_reward/mean": 0.52173912525177, "rewards/fixed_code_pass_all_test_reward/std": 0.36227744817733765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.42612064194798005, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.05078936321660876, "learning_rate": 1.4199898762922593e-05, "loss": 0.002, "num_tokens": 18079789.0, "reward": 1.3571429252624512, "reward_std": 0.39677998423576355, "rewards/fixed_code_pass_all_test_reward/mean": 0.3571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.3967800438404083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 335.5, "completions/mean_terminated_length": 335.5, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.4263051097583472, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.03258274798281491, "learning_rate": 1.4194053108020067e-05, "loss": 0.0013, "num_tokens": 18090209.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 224.0, "completions/mean_terminated_length": 224.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.42648957756871425, "frac_reward_zero_std": 1.0, "grad_norm": 0.46484375, "kl": 0.1254149954766035, "learning_rate": 1.4188205713514877e-05, "loss": 0.005, "num_tokens": 18097257.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 254.5, "completions/mean_terminated_length": 254.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.4266740453790813, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.05890146712772548, "learning_rate": 1.4182356581832398e-05, "loss": 0.0024, "num_tokens": 18102349.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 120.375, "completions/mean_terminated_length": 120.375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.42685851318944845, "frac_reward_zero_std": 0.0, "grad_norm": 3.484375, "kl": 0.13262345176190138, "learning_rate": 1.417650571539872e-05, "loss": 0.0053, "num_tokens": 18109888.0, "reward": 1.1966667175292969, "reward_std": 0.04320497065782547, "rewards/fixed_code_pass_all_test_reward/mean": 0.1966666579246521, "rewards/fixed_code_pass_all_test_reward/std": 0.04320494830608368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 307.375, "completions/mean_terminated_length": 307.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.4270429809998155, "frac_reward_zero_std": 0.0, "grad_norm": 5.25, "kl": 0.2973592160269618, "learning_rate": 1.4170653116640658e-05, "loss": 0.0119, "num_tokens": 18116971.0, "reward": 1.3575581312179565, "reward_std": 0.6232678890228271, "rewards/fixed_code_pass_all_test_reward/mean": 0.6075581312179565, "rewards/fixed_code_pass_all_test_reward/std": 0.32568520307540894, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4272274488101826, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.07538188993930817, "learning_rate": 1.4164798787985747e-05, "loss": 0.003, "num_tokens": 18125592.0, "reward": 1.5986841917037964, "reward_std": 0.37946653366088867, "rewards/fixed_code_pass_all_test_reward/mean": 0.5986841917037964, "rewards/fixed_code_pass_all_test_reward/std": 0.37946656346321106, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.4274119166205497, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.058528524823486805, "learning_rate": 1.4158942731862229e-05, "loss": 0.0023, "num_tokens": 18131719.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 397.5, "completions/mean_terminated_length": 397.5, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.4275963844309168, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.03626931319013238, "learning_rate": 1.4153084950699077e-05, "loss": 0.0015, "num_tokens": 18140443.0, "reward": 1.2291666269302368, "reward_std": 0.03857587277889252, "rewards/fixed_code_pass_all_test_reward/mean": 0.2291666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.03857583552598953, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.42778085224128387, "frac_reward_zero_std": 1.0, "grad_norm": 0.044189453125, "kl": 0.032341688056476414, "learning_rate": 1.4147225446925965e-05, "loss": 0.0013, "num_tokens": 18147440.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.427965320051651, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.060268617467954755, "learning_rate": 1.4141364222973295e-05, "loss": 0.0024, "num_tokens": 18151563.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 343.625, "completions/mean_terminated_length": 343.625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.42814978786201807, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884765625, "kl": 0.04814756428822875, "learning_rate": 1.4135501281272172e-05, "loss": 0.0019, "num_tokens": 18159032.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.42833425567238514, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.056886436650529504, "learning_rate": 1.4129636624254422e-05, "loss": 0.0023, "num_tokens": 18164480.0, "reward": 1.7321429252624512, "reward_std": 0.36967799067497253, "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.36967799067497253, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 438.75, "completions/mean_terminated_length": 438.75, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.4285187234827523, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.0527484726626426, "learning_rate": 1.412377025435257e-05, "loss": 0.0021, "num_tokens": 18173726.0, "reward": 1.6810344457626343, "reward_std": 0.6905781626701355, "rewards/fixed_code_pass_all_test_reward/mean": 0.8060344457626343, "rewards/fixed_code_pass_all_test_reward/std": 0.34871575236320496, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.42870319129311935, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0974280065856874, "learning_rate": 1.4117902173999867e-05, "loss": 0.0039, "num_tokens": 18181410.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.4288876591034864, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.08484070841223001, "learning_rate": 1.4112032385630266e-05, "loss": 0.0034, "num_tokens": 18189434.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 95.625, "completions/mean_terminated_length": 95.625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.42907212691385355, "frac_reward_zero_std": 1.0, "grad_norm": 0.1904296875, "kl": 0.11901609832420945, "learning_rate": 1.4106160891678422e-05, "loss": 0.0048, "num_tokens": 18194863.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.4292565947242206, "frac_reward_zero_std": 1.0, "grad_norm": 0.79296875, "kl": 0.1537151699885726, "learning_rate": 1.4100287694579711e-05, "loss": 0.0061, "num_tokens": 18205310.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 175.375, "completions/mean_terminated_length": 175.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.4294410625345877, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.1043899729847908, "learning_rate": 1.40944127967702e-05, "loss": 0.0042, "num_tokens": 18212225.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 540.125, "completions/mean_terminated_length": 540.125, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.4296255303449548, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.06921311980113387, "learning_rate": 1.4088536200686682e-05, "loss": 0.0028, "num_tokens": 18221986.0, "reward": 1.8250000476837158, "reward_std": 0.36154431104660034, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 217.75, "completions/mean_terminated_length": 217.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.4298099981553219, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.10878137592226267, "learning_rate": 1.408265790876663e-05, "loss": 0.0044, "num_tokens": 18229688.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.42999446596568897, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.16702771931886673, "learning_rate": 1.4076777923448242e-05, "loss": 0.0067, "num_tokens": 18238758.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 207.125, "completions/mean_terminated_length": 207.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.4301789337760561, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.08494161139242351, "learning_rate": 1.4070896247170403e-05, "loss": 0.0034, "num_tokens": 18243655.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 369.625, "completions/mean_terminated_length": 369.625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.43036340158642317, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.09103292692452669, "learning_rate": 1.4065012882372706e-05, "loss": 0.0036, "num_tokens": 18251956.0, "reward": 1.8499999046325684, "reward_std": 0.07766428589820862, "rewards/fixed_code_pass_all_test_reward/mean": 0.8499999642372131, "rewards/fixed_code_pass_all_test_reward/std": 0.077664315700531, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.43054786939679024, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.07002337160520256, "learning_rate": 1.4059127831495445e-05, "loss": 0.0028, "num_tokens": 18258172.0, "reward": 1.3206521272659302, "reward_std": 0.2948266863822937, "rewards/fixed_code_pass_all_test_reward/mean": 0.32065218687057495, "rewards/fixed_code_pass_all_test_reward/std": 0.2948266863822937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 287.0, "completions/mean_terminated_length": 287.0, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.43073233720715737, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.15389314340427518, "learning_rate": 1.4053241096979611e-05, "loss": 0.0062, "num_tokens": 18265476.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 231.875, "completions/mean_terminated_length": 231.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.43091680501752444, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.14215630409307778, "learning_rate": 1.4047352681266894e-05, "loss": 0.0057, "num_tokens": 18270851.0, "reward": 1.8392857313156128, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 350.375, "completions/mean_terminated_length": 350.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.4311012728278915, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.2658307449892163, "learning_rate": 1.4041462586799688e-05, "loss": 0.0106, "num_tokens": 18278518.0, "reward": 0.5909091234207153, "reward_std": 0.6317084431648254, "rewards/fixed_code_pass_all_test_reward/mean": 0.09090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.09718590974807739, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 218.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.43128574063825864, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.18741378467530012, "learning_rate": 1.4035570816021066e-05, "loss": 0.0075, "num_tokens": 18283267.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 174.875, "completions/mean_terminated_length": 174.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.4314702084486257, "frac_reward_zero_std": 1.0, "grad_norm": 0.21875, "kl": 0.1799257853999734, "learning_rate": 1.4029677371374817e-05, "loss": 0.0072, "num_tokens": 18287690.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.4316546762589928, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.26422018092125654, "learning_rate": 1.4023782255305409e-05, "loss": 0.0106, "num_tokens": 18295803.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.4318391440693599, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.27259211987257004, "learning_rate": 1.4017885470258012e-05, "loss": 0.0109, "num_tokens": 18303117.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 385.0, "completions/mean_terminated_length": 385.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.432023611879727, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.21864182129502296, "learning_rate": 1.4011987018678481e-05, "loss": 0.0087, "num_tokens": 18310981.0, "reward": 1.5347222089767456, "reward_std": 0.5127884149551392, "rewards/fixed_code_pass_all_test_reward/mean": 0.6597222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.3166283667087555, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.43220807969009406, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.27462377957999706, "learning_rate": 1.4006086903013371e-05, "loss": 0.011, "num_tokens": 18319960.0, "reward": 1.5749999284744263, "reward_std": 0.6363961100578308, "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.4323925475004612, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.325789213180542, "learning_rate": 1.4000185125709919e-05, "loss": 0.013, "num_tokens": 18326911.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 312.125, "completions/mean_terminated_length": 312.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.43257701531082826, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.2010038923472166, "learning_rate": 1.3994281689216058e-05, "loss": 0.008, "num_tokens": 18333376.0, "reward": 1.5416667461395264, "reward_std": 0.364604651927948, "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3646046221256256, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.43276148312119533, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.322139760479331, "learning_rate": 1.3988376595980402e-05, "loss": 0.0129, "num_tokens": 18338702.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.43294595093156246, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.28393747424706817, "learning_rate": 1.398246984845226e-05, "loss": 0.0114, "num_tokens": 18346321.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 612.375, "completions/mean_terminated_length": 612.375, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.43313041874192953, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.22782360017299652, "learning_rate": 1.3976561449081623e-05, "loss": 0.0091, "num_tokens": 18362300.0, "reward": 1.1157407760620117, "reward_std": 0.09584242850542068, "rewards/fixed_code_pass_all_test_reward/mean": 0.11574073135852814, "rewards/fixed_code_pass_all_test_reward/std": 0.09584243595600128, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.4333148865522966, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.21880211308598518, "learning_rate": 1.3970651400319164e-05, "loss": 0.0088, "num_tokens": 18369184.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 246.75, "completions/mean_terminated_length": 246.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.43349935436266374, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.3263256084173918, "learning_rate": 1.3964739704616248e-05, "loss": 0.0131, "num_tokens": 18377598.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.4336838221730308, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.3049841672182083, "learning_rate": 1.3958826364424913e-05, "loss": 0.0122, "num_tokens": 18386614.0, "reward": 1.6160714626312256, "reward_std": 0.28105252981185913, "rewards/fixed_code_pass_all_test_reward/mean": 0.6160714030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.28105252981185913, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.4338682899833979, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.34668005257844925, "learning_rate": 1.3952911382197891e-05, "loss": 0.0139, "num_tokens": 18409115.0, "reward": 1.8260542154312134, "reward_std": 0.3386598229408264, "rewards/fixed_code_pass_all_test_reward/mean": 0.8260542154312134, "rewards/fixed_code_pass_all_test_reward/std": 0.3386598527431488, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 237.125, "completions/mean_terminated_length": 237.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.434052757793765, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.18450233712792397, "learning_rate": 1.3946994760388584e-05, "loss": 0.0074, "num_tokens": 18413964.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 220.125, "completions/mean_terminated_length": 220.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.4342372256041321, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.3307063039392233, "learning_rate": 1.3941076501451081e-05, "loss": 0.0132, "num_tokens": 18423365.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 445.0, "completions/mean_terminated_length": 216.00001525878906, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.43442169341449915, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.2805747219827026, "learning_rate": 1.3935156607840153e-05, "loss": 0.0112, "num_tokens": 18432597.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.4346061612248663, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.2531166719272733, "learning_rate": 1.3929235082011234e-05, "loss": 0.0101, "num_tokens": 18442141.0, "reward": 0.9464285373687744, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 113.375, "completions/mean_terminated_length": 113.375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.43479062903523336, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.2607689704746008, "learning_rate": 1.3923311926420452e-05, "loss": 0.0104, "num_tokens": 18448832.0, "reward": 1.0681817531585693, "reward_std": 0.0420827642083168, "rewards/fixed_code_pass_all_test_reward/mean": 0.06818182021379471, "rewards/fixed_code_pass_all_test_reward/std": 0.042082734405994415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.43497509684560043, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.2856887746602297, "learning_rate": 1.3917387143524603e-05, "loss": 0.0114, "num_tokens": 18455019.0, "reward": 1.578125, "reward_std": 0.13258251547813416, "rewards/fixed_code_pass_all_test_reward/mean": 0.578125, "rewards/fixed_code_pass_all_test_reward/std": 0.13258251547813416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.43515956465596756, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.22068411391228437, "learning_rate": 1.391146073578116e-05, "loss": 0.0088, "num_tokens": 18460969.0, "reward": 1.6416666507720947, "reward_std": 0.3886280357837677, "rewards/fixed_code_pass_all_test_reward/mean": 0.6416666507720947, "rewards/fixed_code_pass_all_test_reward/std": 0.3886280655860901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.43534403246633463, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.32364399544894695, "learning_rate": 1.3905532705648264e-05, "loss": 0.0129, "num_tokens": 18469303.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 307.375, "completions/mean_terminated_length": 307.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.4355285002767017, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.13101169001311064, "learning_rate": 1.3899603055584741e-05, "loss": 0.0052, "num_tokens": 18475010.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 335.875, "completions/mean_terminated_length": 335.875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.43571296808706883, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.23417114838957787, "learning_rate": 1.3893671788050073e-05, "loss": 0.0094, "num_tokens": 18482737.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.4358974358974359, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.259644215926528, "learning_rate": 1.3887738905504431e-05, "loss": 0.0104, "num_tokens": 18490727.0, "reward": 1.5464285612106323, "reward_std": 0.3372630178928375, "rewards/fixed_code_pass_all_test_reward/mean": 0.5464285612106323, "rewards/fixed_code_pass_all_test_reward/std": 0.3372630178928375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 128.75, "completions/mean_terminated_length": 128.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.436081903707803, "frac_reward_zero_std": 1.0, "grad_norm": 0.2060546875, "kl": 0.2305601481348276, "learning_rate": 1.3881804410408641e-05, "loss": 0.0092, "num_tokens": 18494629.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.4362663715181701, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.27143711410462856, "learning_rate": 1.3875868305224205e-05, "loss": 0.0109, "num_tokens": 18503342.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 440.0, "completions/mean_terminated_length": 440.0, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.4364508393285372, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.1594833480194211, "learning_rate": 1.386993059241329e-05, "loss": 0.0064, "num_tokens": 18512790.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 292.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.43663530713890425, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.1856024218723178, "learning_rate": 1.3863991274438733e-05, "loss": 0.0074, "num_tokens": 18522269.0, "reward": 1.7219388484954834, "reward_std": 0.7001888751983643, "rewards/fixed_code_pass_all_test_reward/mean": 0.8469387888908386, "rewards/fixed_code_pass_all_test_reward/std": 0.35111531615257263, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.4368197749492714, "frac_reward_zero_std": 0.0, "grad_norm": 3.53125, "kl": 0.3162971567362547, "learning_rate": 1.3858050353764032e-05, "loss": 0.0127, "num_tokens": 18528826.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.43700424275963845, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.14776811376214027, "learning_rate": 1.3852107832853356e-05, "loss": 0.0059, "num_tokens": 18533468.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.4371887105700055, "frac_reward_zero_std": 0.0, "grad_norm": 3.9375, "kl": 0.2739961463958025, "learning_rate": 1.3846163714171532e-05, "loss": 0.011, "num_tokens": 18539849.0, "reward": 1.4375, "reward_std": 0.7763237953186035, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 406.5, "completions/mean_terminated_length": 406.5, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.43737317838037265, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.1688932068645954, "learning_rate": 1.3840218000184053e-05, "loss": 0.0068, "num_tokens": 18548469.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 278.0, "completions/mean_terminated_length": 278.0, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4375576461907397, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.21135763637721539, "learning_rate": 1.3834270693357073e-05, "loss": 0.0085, "num_tokens": 18555693.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.4377421140011068, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.23831075732596219, "learning_rate": 1.3828321796157408e-05, "loss": 0.0095, "num_tokens": 18559549.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 131.0, "completions/mean_terminated_length": 131.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.4379265818114739, "frac_reward_zero_std": 1.0, "grad_norm": 0.1767578125, "kl": 0.18913041427731514, "learning_rate": 1.3822371311052525e-05, "loss": 0.0076, "num_tokens": 18567829.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 199.875, "completions/mean_terminated_length": 199.875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.438111049621841, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.1896161511540413, "learning_rate": 1.3816419240510566e-05, "loss": 0.0076, "num_tokens": 18573444.0, "reward": 1.646505355834961, "reward_std": 0.48859068751335144, "rewards/fixed_code_pass_all_test_reward/mean": 0.7715053558349609, "rewards/fixed_code_pass_all_test_reward/std": 0.4230898320674896, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.43829551743220807, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.14107622671872377, "learning_rate": 1.3810465587000314e-05, "loss": 0.0056, "num_tokens": 18578255.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 267.625, "completions/mean_terminated_length": 267.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.43847998524257514, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.14239806402474642, "learning_rate": 1.3804510352991223e-05, "loss": 0.0057, "num_tokens": 18583284.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.43866445305294227, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.2787378951907158, "learning_rate": 1.379855354095339e-05, "loss": 0.0111, "num_tokens": 18588869.0, "reward": 1.564849615097046, "reward_std": 0.2940838932991028, "rewards/fixed_code_pass_all_test_reward/mean": 0.5648496150970459, "rewards/fixed_code_pass_all_test_reward/std": 0.29408392310142517, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 227.5, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.43884892086330934, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.15578968357294798, "learning_rate": 1.3792595153357574e-05, "loss": 0.0062, "num_tokens": 18597873.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 170.875, "completions/mean_terminated_length": 170.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.4390333886736764, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.1466474561020732, "learning_rate": 1.3786635192675184e-05, "loss": 0.0059, "num_tokens": 18602016.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.43921785648404355, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.12890774384140968, "learning_rate": 1.3780673661378285e-05, "loss": 0.0052, "num_tokens": 18609983.0, "reward": 1.7564935684204102, "reward_std": 0.17183707654476166, "rewards/fixed_code_pass_all_test_reward/mean": 0.7564934492111206, "rewards/fixed_code_pass_all_test_reward/std": 0.17183709144592285, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.4394023242944106, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.1428692671470344, "learning_rate": 1.3774710561939593e-05, "loss": 0.0057, "num_tokens": 18615695.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.4395867921047777, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.1795412627980113, "learning_rate": 1.3768745896832464e-05, "loss": 0.0072, "num_tokens": 18620951.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 326.375, "completions/mean_terminated_length": 326.375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.4397712599151448, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.09315017005428672, "learning_rate": 1.3762779668530925e-05, "loss": 0.0037, "num_tokens": 18632026.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.4399557277255119, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.06519286264665425, "learning_rate": 1.3756811879509629e-05, "loss": 0.0026, "num_tokens": 18637037.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.44014019553587896, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.09086298383772373, "learning_rate": 1.3750842532243892e-05, "loss": 0.0036, "num_tokens": 18641972.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 119.25, "completions/mean_terminated_length": 119.25, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.4403246633462461, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.11173389153555036, "learning_rate": 1.3744871629209663e-05, "loss": 0.0045, "num_tokens": 18650726.0, "reward": 1.922727346420288, "reward_std": 0.008416539058089256, "rewards/fixed_code_pass_all_test_reward/mean": 0.9227272868156433, "rewards/fixed_code_pass_all_test_reward/std": 0.008416538126766682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.44050913115661317, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.10862489510327578, "learning_rate": 1.3738899172883555e-05, "loss": 0.0043, "num_tokens": 18661119.0, "reward": 1.0138888359069824, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.013888888992369175, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.44069359896698024, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.07818644400686026, "learning_rate": 1.3732925165742805e-05, "loss": 0.0031, "num_tokens": 18671917.0, "reward": 1.2664473056793213, "reward_std": 0.5117220282554626, "rewards/fixed_code_pass_all_test_reward/mean": 0.39144736528396606, "rewards/fixed_code_pass_all_test_reward/std": 0.15816862881183624, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 203.125, "completions/mean_terminated_length": 203.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.44087806677734737, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.08197860512882471, "learning_rate": 1.3726949610265302e-05, "loss": 0.0033, "num_tokens": 18680702.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 661.0, "completions/mean_terminated_length": 661.0, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.44106253458771444, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.056136388797312975, "learning_rate": 1.3720972508929582e-05, "loss": 0.0022, "num_tokens": 18697062.0, "reward": 1.0277776718139648, "reward_std": 0.5598598122596741, "rewards/fixed_code_pass_all_test_reward/mean": 0.2777777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.2383890002965927, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.4412470023980815, "frac_reward_zero_std": 1.0, "grad_norm": 0.236328125, "kl": 0.07745602447539568, "learning_rate": 1.3714993864214813e-05, "loss": 0.0031, "num_tokens": 18700905.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 90.375, "completions/mean_terminated_length": 90.375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.44143147020844864, "frac_reward_zero_std": 1.0, "grad_norm": 0.1572265625, "kl": 0.0894260979257524, "learning_rate": 1.3709013678600813e-05, "loss": 0.0036, "num_tokens": 18707396.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 199.5, "completions/mean_terminated_length": 199.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.4416159380188157, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.07370264455676079, "learning_rate": 1.3703031954568027e-05, "loss": 0.0029, "num_tokens": 18713096.0, "reward": 1.456730842590332, "reward_std": 0.12238384783267975, "rewards/fixed_code_pass_all_test_reward/mean": 0.45673078298568726, "rewards/fixed_code_pass_all_test_reward/std": 0.12238387763500214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 196.125, "completions/mean_terminated_length": 196.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.4418004058291828, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.04763497319072485, "learning_rate": 1.3697048694597555e-05, "loss": 0.0019, "num_tokens": 18717545.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.4419848736395499, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.08479054411873221, "learning_rate": 1.3691063901171116e-05, "loss": 0.0034, "num_tokens": 18727130.0, "reward": 1.5833333730697632, "reward_std": 0.34503278136253357, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 401.75, "completions/mean_terminated_length": 401.75, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.442169341449917, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.0720343654975295, "learning_rate": 1.368507757677107e-05, "loss": 0.0029, "num_tokens": 18736048.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 181.625, "completions/mean_terminated_length": 181.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.44235380926028406, "frac_reward_zero_std": 1.0, "grad_norm": 7.0, "kl": 0.449614554643631, "learning_rate": 1.3679089723880427e-05, "loss": 0.018, "num_tokens": 18740261.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 296.25, "completions/mean_terminated_length": 296.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.4425382770706512, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.06819342158269137, "learning_rate": 1.3673100344982809e-05, "loss": 0.0027, "num_tokens": 18752567.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 136.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.44272274488101826, "frac_reward_zero_std": 1.0, "grad_norm": 0.2314453125, "kl": 0.13489667186513543, "learning_rate": 1.3667109442562485e-05, "loss": 0.0054, "num_tokens": 18757575.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.44290721269138533, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.12358535639941692, "learning_rate": 1.366111701910435e-05, "loss": 0.0049, "num_tokens": 18769188.0, "reward": 1.0711207389831543, "reward_std": 0.02149001881480217, "rewards/fixed_code_pass_all_test_reward/mean": 0.07112069427967072, "rewards/fixed_code_pass_all_test_reward/std": 0.021490059792995453, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 278.125, "completions/mean_terminated_length": 278.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.44309168050175246, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.053720123367384076, "learning_rate": 1.3655123077093934e-05, "loss": 0.0021, "num_tokens": 18778645.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.44327614831211953, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.06505869957618415, "learning_rate": 1.3649127619017395e-05, "loss": 0.0026, "num_tokens": 18782650.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 162.0, "completions/mean_terminated_length": 162.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4434606161224866, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "kl": 0.09342577029019594, "learning_rate": 1.3643130647361519e-05, "loss": 0.0037, "num_tokens": 18789714.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 213.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.44364508393285373, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.15652556577697396, "learning_rate": 1.3637132164613717e-05, "loss": 0.0063, "num_tokens": 18800740.0, "reward": 1.9910714626312256, "reward_std": 0.025253789499402046, "rewards/fixed_code_pass_all_test_reward/mean": 0.9910714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.025253823027014732, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 204.625, "completions/mean_terminated_length": 204.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.4438295517432208, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.08519625524058938, "learning_rate": 1.3631132173262034e-05, "loss": 0.0034, "num_tokens": 18808945.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 259.125, "completions/mean_terminated_length": 259.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.4440140195535879, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.08928339136764407, "learning_rate": 1.3625130675795135e-05, "loss": 0.0036, "num_tokens": 18818490.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 329.0, "completions/mean_terminated_length": 329.0, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.444198487363955, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.07183677842840552, "learning_rate": 1.3619127674702312e-05, "loss": 0.0029, "num_tokens": 18824970.0, "reward": 1.359375, "reward_std": 0.6175900101661682, "rewards/fixed_code_pass_all_test_reward/mean": 0.484375, "rewards/fixed_code_pass_all_test_reward/std": 0.3435470163822174, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.4443829551743221, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.08551887795329094, "learning_rate": 1.3613123172473484e-05, "loss": 0.0034, "num_tokens": 18835330.0, "reward": 1.659999966621399, "reward_std": 0.4771343171596527, "rewards/fixed_code_pass_all_test_reward/mean": 0.6599999666213989, "rewards/fixed_code_pass_all_test_reward/std": 0.4771342873573303, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 134.625, "completions/mean_terminated_length": 134.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.44456742298468915, "frac_reward_zero_std": 0.0, "grad_norm": 3.96875, "kl": 0.1463519069366157, "learning_rate": 1.360711717159918e-05, "loss": 0.0059, "num_tokens": 18840415.0, "reward": 1.8985848426818848, "reward_std": 0.18778426945209503, "rewards/fixed_code_pass_all_test_reward/mean": 0.8985849022865295, "rewards/fixed_code_pass_all_test_reward/std": 0.18778428435325623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.4447518907950563, "frac_reward_zero_std": 1.0, "grad_norm": 0.23046875, "kl": 0.08010721672326326, "learning_rate": 1.360110967457057e-05, "loss": 0.0032, "num_tokens": 18846612.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 402.25, "completions/mean_terminated_length": 402.25, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.44493635860542335, "frac_reward_zero_std": 1.0, "grad_norm": 0.2392578125, "kl": 0.08153816871345043, "learning_rate": 1.3595100683879422e-05, "loss": 0.0033, "num_tokens": 18854270.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 162.375, "completions/mean_terminated_length": 162.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.4451208264157904, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125, "kl": 0.09894833480939269, "learning_rate": 1.3589090202018148e-05, "loss": 0.004, "num_tokens": 18861473.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.44530529422615756, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.14720860216766596, "learning_rate": 1.3583078231479758e-05, "loss": 0.0059, "num_tokens": 18870808.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 187.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.44548976203652463, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.05324527667835355, "learning_rate": 1.3577064774757892e-05, "loss": 0.0021, "num_tokens": 18875038.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 312.875, "completions/mean_terminated_length": 312.875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.4456742298468917, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.09247010108083487, "learning_rate": 1.35710498343468e-05, "loss": 0.0037, "num_tokens": 18882725.0, "reward": 1.7589285373687744, "reward_std": 0.446785569190979, "rewards/fixed_code_pass_all_test_reward/mean": 0.7589285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.446785569190979, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 450.5, "completions/mean_terminated_length": 450.5, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.44585869765725883, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.04736112500540912, "learning_rate": 1.3565033412741347e-05, "loss": 0.0019, "num_tokens": 18890809.0, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 184.125, "completions/mean_terminated_length": 184.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.4460431654676259, "frac_reward_zero_std": 1.0, "grad_norm": 0.154296875, "kl": 0.09252556995488703, "learning_rate": 1.355901551243702e-05, "loss": 0.0037, "num_tokens": 18896314.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 157.25, "completions/mean_terminated_length": 157.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.446227633277993, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.1582537880167365, "learning_rate": 1.3552996135929907e-05, "loss": 0.0063, "num_tokens": 18900436.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.4464121010883601, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.07387904333882034, "learning_rate": 1.3546975285716726e-05, "loss": 0.003, "num_tokens": 18904630.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.4465965688987272, "frac_reward_zero_std": 0.0, "grad_norm": 5.3125, "kl": 0.0887874742038548, "learning_rate": 1.3540952964294784e-05, "loss": 0.0036, "num_tokens": 18908926.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 102.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.44678103670909425, "frac_reward_zero_std": 0.0, "grad_norm": 3.71875, "kl": 0.13273318484425545, "learning_rate": 1.353492917416202e-05, "loss": 0.0053, "num_tokens": 18917526.0, "reward": 1.454941987991333, "reward_std": 0.45135098695755005, "rewards/fixed_code_pass_all_test_reward/mean": 0.45494186878204346, "rewards/fixed_code_pass_all_test_reward/std": 0.45135101675987244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 203.75, "completions/mean_terminated_length": 203.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.4469655045194614, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.08398925140500069, "learning_rate": 1.3528903917816965e-05, "loss": 0.0034, "num_tokens": 18922644.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 261.75, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.44714997232982845, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.07608189340680838, "learning_rate": 1.3522877197758765e-05, "loss": 0.003, "num_tokens": 18930930.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.4473344401401955, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.10449591930955648, "learning_rate": 1.351684901648718e-05, "loss": 0.0042, "num_tokens": 18939234.0, "reward": 1.514423131942749, "reward_std": 0.5204757452011108, "rewards/fixed_code_pass_all_test_reward/mean": 0.5144230723381042, "rewards/fixed_code_pass_all_test_reward/std": 0.5204757452011108, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 134.625, "completions/mean_terminated_length": 134.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.44751890795056265, "frac_reward_zero_std": 1.0, "grad_norm": 0.177734375, "kl": 0.1894116159528494, "learning_rate": 1.3510819376502562e-05, "loss": 0.0076, "num_tokens": 18947607.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.4477033757609297, "frac_reward_zero_std": 1.0, "grad_norm": 0.1875, "kl": 0.06283496227115393, "learning_rate": 1.3504788280305881e-05, "loss": 0.0025, "num_tokens": 18955857.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.4478878435712968, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.08352888282388449, "learning_rate": 1.34987557303987e-05, "loss": 0.0033, "num_tokens": 18961318.0, "reward": 1.75, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 171.75, "completions/mean_terminated_length": 171.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.4480723113816639, "frac_reward_zero_std": 1.0, "grad_norm": 0.248046875, "kl": 0.09047797811217606, "learning_rate": 1.3492721729283199e-05, "loss": 0.0036, "num_tokens": 18969316.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.448256779192031, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.08276488701812923, "learning_rate": 1.348668627946214e-05, "loss": 0.0033, "num_tokens": 18978470.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.44844124700239807, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.08499652286991477, "learning_rate": 1.3480649383438908e-05, "loss": 0.0034, "num_tokens": 18982697.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 466.5, "completions/mean_terminated_length": 466.5, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.4486257148127652, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.045710650039836764, "learning_rate": 1.347461104371747e-05, "loss": 0.0018, "num_tokens": 18991925.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.44881018262313227, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.09433160861954093, "learning_rate": 1.3468571262802401e-05, "loss": 0.0038, "num_tokens": 18996424.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.44899465043349934, "frac_reward_zero_std": 1.0, "grad_norm": 0.1708984375, "kl": 0.06814114865846932, "learning_rate": 1.3462530043198874e-05, "loss": 0.0027, "num_tokens": 19001761.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 188.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.44917911824386647, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.1306453114375472, "learning_rate": 1.3456487387412652e-05, "loss": 0.0052, "num_tokens": 19010960.0, "reward": 1.6296296119689941, "reward_std": 0.3066957890987396, "rewards/fixed_code_pass_all_test_reward/mean": 0.6296296119689941, "rewards/fixed_code_pass_all_test_reward/std": 0.306695818901062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 349.75, "completions/mean_terminated_length": 349.75, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.44936358605423354, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.07625935226678848, "learning_rate": 1.3450443297950102e-05, "loss": 0.003, "num_tokens": 19018910.0, "reward": 1.178571343421936, "reward_std": 0.1266293227672577, "rewards/fixed_code_pass_all_test_reward/mean": 0.1785714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.1266293227672577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4495480538646006, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.0980570986866951, "learning_rate": 1.3444397777318178e-05, "loss": 0.0039, "num_tokens": 19025822.0, "reward": 1.5116279125213623, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5116279125213623, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.44973252167496774, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.06190782482735813, "learning_rate": 1.3438350828024438e-05, "loss": 0.0025, "num_tokens": 19033001.0, "reward": 1.1176470518112183, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.11764705926179886, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.4499169894853348, "frac_reward_zero_std": 1.0, "grad_norm": 0.314453125, "kl": 0.13745060563087463, "learning_rate": 1.3432302452577021e-05, "loss": 0.0055, "num_tokens": 19043836.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 346.125, "completions/mean_terminated_length": 346.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.4501014572957019, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.06961010466329753, "learning_rate": 1.3426252653484661e-05, "loss": 0.0028, "num_tokens": 19054517.0, "reward": 1.75, "reward_std": 0.1601439267396927, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.1601439118385315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.450285925106069, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.08412629831582308, "learning_rate": 1.342020143325669e-05, "loss": 0.0034, "num_tokens": 19059090.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 211.375, "completions/mean_terminated_length": 211.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.4504703929164361, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.07624866208061576, "learning_rate": 1.3414148794403015e-05, "loss": 0.003, "num_tokens": 19063741.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.45065486072680316, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.09209005162119865, "learning_rate": 1.340809473943415e-05, "loss": 0.0037, "num_tokens": 19072463.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.45083932853717024, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.05546656018123031, "learning_rate": 1.3402039270861175e-05, "loss": 0.0022, "num_tokens": 19077168.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.45102379634753736, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.06849043047986925, "learning_rate": 1.339598239119578e-05, "loss": 0.0027, "num_tokens": 19084691.0, "reward": 1.6375000476837158, "reward_std": 0.7116798162460327, "rewards/fixed_code_pass_all_test_reward/mean": 0.762499988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.40451323986053467, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 126.875, "completions/mean_terminated_length": 126.875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.45120826415790444, "frac_reward_zero_std": 1.0, "grad_norm": 0.1904296875, "kl": 0.1088853720575571, "learning_rate": 1.3389924102950215e-05, "loss": 0.0044, "num_tokens": 19088554.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.4513927319682715, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.07437328388914466, "learning_rate": 1.338386440863734e-05, "loss": 0.003, "num_tokens": 19098624.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 90.25, "completions/mean_terminated_length": 90.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.45157719977863864, "frac_reward_zero_std": 1.0, "grad_norm": 0.185546875, "kl": 0.13830979866907, "learning_rate": 1.3377803310770573e-05, "loss": 0.0055, "num_tokens": 19102042.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 245.375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.4517616675890057, "frac_reward_zero_std": 1.0, "grad_norm": 0.21875, "kl": 0.11745353508740664, "learning_rate": 1.3371740811863936e-05, "loss": 0.0047, "num_tokens": 19109405.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.4519461353993728, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.05365047696977854, "learning_rate": 1.336567691443202e-05, "loss": 0.0021, "num_tokens": 19117138.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 104.25, "completions/mean_terminated_length": 104.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.4521306032097399, "frac_reward_zero_std": 1.0, "grad_norm": 0.396484375, "kl": 0.11153360549360514, "learning_rate": 1.3359611620989995e-05, "loss": 0.0045, "num_tokens": 19121524.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 329.875, "completions/mean_terminated_length": 329.875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.452315071020107, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.04811219894327223, "learning_rate": 1.3353544934053618e-05, "loss": 0.0019, "num_tokens": 19128131.0, "reward": 1.671875, "reward_std": 0.46740877628326416, "rewards/fixed_code_pass_all_test_reward/mean": 0.796875, "rewards/fixed_code_pass_all_test_reward/std": 0.3892386257648468, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 363.75, "completions/mean_terminated_length": 363.75, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.45249953883047406, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.028371982043609023, "learning_rate": 1.3347476856139217e-05, "loss": 0.0011, "num_tokens": 19139689.0, "reward": 1.2361111640930176, "reward_std": 0.17251642048358917, "rewards/fixed_code_pass_all_test_reward/mean": 0.236111119389534, "rewards/fixed_code_pass_all_test_reward/std": 0.17251639068126678, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 376.0, "completions/mean_terminated_length": 376.0, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.4526840066408412, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.06710508605465293, "learning_rate": 1.3341407389763701e-05, "loss": 0.0027, "num_tokens": 19147913.0, "reward": 1.6057692766189575, "reward_std": 0.24476774036884308, "rewards/fixed_code_pass_all_test_reward/mean": 0.6057692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.24476775527000427, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 347.25, "completions/mean_terminated_length": 347.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.45286847445120826, "frac_reward_zero_std": 1.0, "grad_norm": 0.1572265625, "kl": 0.06659013591706753, "learning_rate": 1.3335336537444556e-05, "loss": 0.0027, "num_tokens": 19156523.0, "reward": 1.1304347515106201, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1304347813129425, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 155.375, "completions/mean_terminated_length": 155.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.45305294226157533, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "kl": 0.054383386159315705, "learning_rate": 1.3329264301699835e-05, "loss": 0.0022, "num_tokens": 19165718.0, "reward": 1.5416667461395264, "reward_std": 0.18519467115402222, "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.18519464135169983, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 299.0, "completions/mean_terminated_length": 299.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.45323741007194246, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.10103775560855865, "learning_rate": 1.3323190685048175e-05, "loss": 0.004, "num_tokens": 19176966.0, "reward": 1.1875, "reward_std": 0.07576141506433487, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.07576144486665726, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 221.25, "completions/mean_terminated_length": 221.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.45342187788230953, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.0649414537474513, "learning_rate": 1.331711569000878e-05, "loss": 0.0026, "num_tokens": 19186600.0, "reward": 1.8380682468414307, "reward_std": 0.20088264346122742, "rewards/fixed_code_pass_all_test_reward/mean": 0.8380682468414307, "rewards/fixed_code_pass_all_test_reward/std": 0.20088261365890503, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.4536063456926766, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.08054102933965623, "learning_rate": 1.3311039319101426e-05, "loss": 0.0032, "num_tokens": 19191580.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.45379081350304373, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "kl": 0.06570837227627635, "learning_rate": 1.330496157484646e-05, "loss": 0.0026, "num_tokens": 19195567.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 142.375, "completions/mean_terminated_length": 142.375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.4539752813134108, "frac_reward_zero_std": 1.0, "grad_norm": 0.232421875, "kl": 0.11337934341281652, "learning_rate": 1.32988824597648e-05, "loss": 0.0045, "num_tokens": 19203274.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 576.375, "completions/mean_terminated_length": 576.375, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.4541597491237779, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.03702275734394789, "learning_rate": 1.329280197637793e-05, "loss": 0.0015, "num_tokens": 19214469.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 264.5, "completions/mean_terminated_length": 264.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.454344216934145, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.09525910392403603, "learning_rate": 1.3286720127207907e-05, "loss": 0.0038, "num_tokens": 19223545.0, "reward": 1.7598040103912354, "reward_std": 0.310337096452713, "rewards/fixed_code_pass_all_test_reward/mean": 0.7598039507865906, "rewards/fixed_code_pass_all_test_reward/std": 0.310337096452713, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.4545286847445121, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.06513310899026692, "learning_rate": 1.3280636914777344e-05, "loss": 0.0026, "num_tokens": 19230298.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 92.0, "completions/mean_terminated_length": 92.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.45471315255487915, "frac_reward_zero_std": 0.0, "grad_norm": 3.34375, "kl": 0.1422912608832121, "learning_rate": 1.3274552341609435e-05, "loss": 0.0057, "num_tokens": 19236506.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 203.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4548976203652463, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.12110247323289514, "learning_rate": 1.3268466410227923e-05, "loss": 0.0048, "num_tokens": 19244769.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 313.75, "completions/mean_terminated_length": 313.75, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.45508208817561335, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.04298601229675114, "learning_rate": 1.3262379123157121e-05, "loss": 0.0017, "num_tokens": 19252095.0, "reward": 1.9305555820465088, "reward_std": 0.12858611345291138, "rewards/fixed_code_pass_all_test_reward/mean": 0.9305555820465088, "rewards/fixed_code_pass_all_test_reward/std": 0.12858612835407257, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 351.25, "completions/mean_terminated_length": 351.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.4552665559859804, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.04315890418365598, "learning_rate": 1.325629048292191e-05, "loss": 0.0017, "num_tokens": 19258905.0, "reward": 1.3392856121063232, "reward_std": 0.2850758135318756, "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.3306500315666199, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.45545102379634755, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.05726885027252138, "learning_rate": 1.3250200492047718e-05, "loss": 0.0023, "num_tokens": 19266871.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 206.125, "completions/mean_terminated_length": 206.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.4556354916067146, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.04380124295130372, "learning_rate": 1.324410915306055e-05, "loss": 0.0018, "num_tokens": 19271856.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.4558199594170817, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.0635080540087074, "learning_rate": 1.3238016468486957e-05, "loss": 0.0025, "num_tokens": 19276536.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 109.875, "completions/mean_terminated_length": 109.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.4560044272274488, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.05440603801980615, "learning_rate": 1.3231922440854052e-05, "loss": 0.0022, "num_tokens": 19284903.0, "reward": 1.4025423526763916, "reward_std": 0.333336740732193, "rewards/fixed_code_pass_all_test_reward/mean": 0.4025424122810364, "rewards/fixed_code_pass_all_test_reward/std": 0.3333367705345154, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 209.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.4561888950378159, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.05007569072768092, "learning_rate": 1.322582707268951e-05, "loss": 0.002, "num_tokens": 19289825.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 309.125, "completions/mean_terminated_length": 309.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.456373362848183, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.040785177145153284, "learning_rate": 1.3219730366521555e-05, "loss": 0.0016, "num_tokens": 19295858.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4565578306585501, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.039472236298024654, "learning_rate": 1.3213632324878973e-05, "loss": 0.0016, "num_tokens": 19302403.0, "reward": 1.8518519401550293, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8518518805503845, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 282.125, "completions/mean_terminated_length": 282.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.4567422984689172, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.023852482554502785, "learning_rate": 1.3207532950291093e-05, "loss": 0.001, "num_tokens": 19308348.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.45692676627928425, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.05310309026390314, "learning_rate": 1.3201432245287814e-05, "loss": 0.0021, "num_tokens": 19314810.0, "reward": 1.7073863744735718, "reward_std": 0.12114022672176361, "rewards/fixed_code_pass_all_test_reward/mean": 0.7073863744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.12114022672176361, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.4571112340896514, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.037566120037809014, "learning_rate": 1.3195330212399567e-05, "loss": 0.0015, "num_tokens": 19322380.0, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.45729570190001845, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "kl": 0.0744346589781344, "learning_rate": 1.3189226854157348e-05, "loss": 0.003, "num_tokens": 19327157.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 348.625, "completions/mean_terminated_length": 348.625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.4574801697103855, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.13226413866505027, "learning_rate": 1.31831221730927e-05, "loss": 0.0053, "num_tokens": 19335138.0, "reward": 1.682692289352417, "reward_std": 0.027196446433663368, "rewards/fixed_code_pass_all_test_reward/mean": 0.682692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.45766463752075265, "frac_reward_zero_std": 0.0, "grad_norm": 4.40625, "kl": 0.09098851727321744, "learning_rate": 1.3177016171737707e-05, "loss": 0.0036, "num_tokens": 19344404.0, "reward": 1.2899999618530273, "reward_std": 0.055549152195453644, "rewards/fixed_code_pass_all_test_reward/mean": 0.28999999165534973, "rewards/fixed_code_pass_all_test_reward/std": 0.05554920434951782, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 101.125, "completions/mean_terminated_length": 101.125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.4578491053311197, "frac_reward_zero_std": 1.0, "grad_norm": 0.67578125, "kl": 0.1553835328668356, "learning_rate": 1.3170908852625013e-05, "loss": 0.0062, "num_tokens": 19350973.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 317.0, "completions/mean_terminated_length": 317.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.4580335731414868, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.04400401236489415, "learning_rate": 1.3164800218287797e-05, "loss": 0.0018, "num_tokens": 19360509.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4582180409518539, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.05981471762061119, "learning_rate": 1.3158690271259792e-05, "loss": 0.0024, "num_tokens": 19364554.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 289.75, "completions/mean_terminated_length": 289.75, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.458402508762221, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201171875, "kl": 0.02758068428374827, "learning_rate": 1.3152579014075268e-05, "loss": 0.0011, "num_tokens": 19371696.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 421.75, "completions/mean_terminated_length": 421.75, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.45858697657258807, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.03773955604992807, "learning_rate": 1.3146466449269049e-05, "loss": 0.0015, "num_tokens": 19378566.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 147.25, "completions/mean_terminated_length": 147.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4587714443829552, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.09447152726352215, "learning_rate": 1.3140352579376488e-05, "loss": 0.0038, "num_tokens": 19387520.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 422.125, "completions/mean_terminated_length": 422.125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.45895591219332227, "frac_reward_zero_std": 1.0, "grad_norm": 0.061279296875, "kl": 0.05271705408813432, "learning_rate": 1.3134237406933493e-05, "loss": 0.0021, "num_tokens": 19395913.0, "reward": 1.923076868057251, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9230769276618958, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 531.125, "completions/mean_terminated_length": 531.125, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.45914038000368934, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.05102872010320425, "learning_rate": 1.3128120934476499e-05, "loss": 0.002, "num_tokens": 19413666.0, "reward": 1.5625, "reward_std": 0.7081582546234131, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.42433422803878784, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 94.625, "completions/mean_terminated_length": 94.625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.45932484781405647, "frac_reward_zero_std": 0.0, "grad_norm": 3.453125, "kl": 0.06408467143774033, "learning_rate": 1.312200316454249e-05, "loss": 0.0026, "num_tokens": 19420655.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 174.875, "completions/mean_terminated_length": 174.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.45950931562442354, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.05304920766502619, "learning_rate": 1.3115884099668985e-05, "loss": 0.0021, "num_tokens": 19427790.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 281.125, "completions/mean_terminated_length": 281.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.4596937834347906, "frac_reward_zero_std": 0.0, "grad_norm": 18.0, "kl": 1.617829819675535, "learning_rate": 1.3109763742394037e-05, "loss": 0.0647, "num_tokens": 19436799.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 303.625, "completions/mean_terminated_length": 303.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.45987825124515774, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.06304322509095073, "learning_rate": 1.3103642095256244e-05, "loss": 0.0025, "num_tokens": 19446660.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.4600627190555248, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.07168955495581031, "learning_rate": 1.3097519160794726e-05, "loss": 0.0029, "num_tokens": 19452950.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 247.75, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.4602471868658919, "frac_reward_zero_std": 0.0, "grad_norm": 3.828125, "kl": 0.039973490638658404, "learning_rate": 1.3091394941549149e-05, "loss": 0.0016, "num_tokens": 19461668.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 294.75, "completions/mean_terminated_length": 294.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.460431654676259, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.0715390108525753, "learning_rate": 1.308526944005971e-05, "loss": 0.0029, "num_tokens": 19475546.0, "reward": 1.8953487873077393, "reward_std": 0.14655643701553345, "rewards/fixed_code_pass_all_test_reward/mean": 0.8953487873077393, "rewards/fixed_code_pass_all_test_reward/std": 0.14655643701553345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 203.25, "completions/mean_terminated_length": 203.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.4606161224866261, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.03440588910598308, "learning_rate": 1.3079142658867124e-05, "loss": 0.0014, "num_tokens": 19481148.0, "reward": 1.7586207389831543, "reward_std": 0.09753197431564331, "rewards/fixed_code_pass_all_test_reward/mean": 0.7586206793785095, "rewards/fixed_code_pass_all_test_reward/std": 0.09753198176622391, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 491.375, "completions/mean_terminated_length": 491.375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.46080059029699316, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.05455622193403542, "learning_rate": 1.3073014600512654e-05, "loss": 0.0022, "num_tokens": 19495455.0, "reward": 1.5532786846160889, "reward_std": 0.7302380204200745, "rewards/fixed_code_pass_all_test_reward/mean": 0.6782786846160889, "rewards/fixed_code_pass_all_test_reward/std": 0.4630914628505707, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.4609850581073603, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.06572955404408276, "learning_rate": 1.3066885267538088e-05, "loss": 0.0026, "num_tokens": 19503141.0, "reward": 1.962499976158142, "reward_std": 0.07440242171287537, "rewards/fixed_code_pass_all_test_reward/mean": 0.9624999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.07440237700939178, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 184.125, "completions/mean_terminated_length": 184.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.46116952591772736, "frac_reward_zero_std": 1.0, "grad_norm": 0.71484375, "kl": 0.07924494612962008, "learning_rate": 1.306075466248574e-05, "loss": 0.0032, "num_tokens": 19507726.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 233.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.46135399372809444, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.08651424432173371, "learning_rate": 1.305462278789845e-05, "loss": 0.0035, "num_tokens": 19516852.0, "reward": 1.6622340679168701, "reward_std": 0.35391902923583984, "rewards/fixed_code_pass_all_test_reward/mean": 0.6622340679168701, "rewards/fixed_code_pass_all_test_reward/std": 0.35391902923583984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.46153846153846156, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.077399758156389, "learning_rate": 1.3048489646319589e-05, "loss": 0.0031, "num_tokens": 19525576.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.46172292934882864, "frac_reward_zero_std": 0.0, "grad_norm": 4.4375, "kl": 0.31580981193110347, "learning_rate": 1.304235524029305e-05, "loss": 0.0126, "num_tokens": 19532620.0, "reward": 1.829545497894287, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.9545454382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 463.75, "completions/mean_terminated_length": 463.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.4619073971591957, "frac_reward_zero_std": 1.0, "grad_norm": 0.0419921875, "kl": 0.026968541438691318, "learning_rate": 1.3036219572363255e-05, "loss": 0.0011, "num_tokens": 19540658.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.46209186496956284, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.06700132554396987, "learning_rate": 1.3030082645075144e-05, "loss": 0.0027, "num_tokens": 19549057.0, "reward": 1.1875, "reward_std": 0.5303300619125366, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.4622763327799299, "frac_reward_zero_std": 1.0, "grad_norm": 0.1904296875, "kl": 0.07916123000904918, "learning_rate": 1.3023944460974183e-05, "loss": 0.0032, "num_tokens": 19556481.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 511.5, "completions/mean_terminated_length": 511.5, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.462460800590297, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.022993050748482347, "learning_rate": 1.301780502260636e-05, "loss": 0.0009, "num_tokens": 19575861.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 569.75, "completions/mean_terminated_length": 569.75, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.4626452684006641, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.04371033306233585, "learning_rate": 1.3011664332518178e-05, "loss": 0.0017, "num_tokens": 19587355.0, "reward": 1.2291666269302368, "reward_std": 0.3204349875450134, "rewards/fixed_code_pass_all_test_reward/mean": 0.2291666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.32043495774269104, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 156.0, "completions/mean_terminated_length": 156.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4628297362110312, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.061116638127714396, "learning_rate": 1.3005522393256667e-05, "loss": 0.0024, "num_tokens": 19596523.0, "reward": 0.8787878751754761, "reward_std": 0.35524219274520874, "rewards/fixed_code_pass_all_test_reward/mean": 0.0037878789007663727, "rewards/fixed_code_pass_all_test_reward/std": 0.010713739320635796, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.46301420402139826, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.10051394999027252, "learning_rate": 1.2999379207369365e-05, "loss": 0.004, "num_tokens": 19605263.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 546.875, "completions/mean_terminated_length": 546.875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.46319867183176533, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.05699417903088033, "learning_rate": 1.2993234777404336e-05, "loss": 0.0023, "num_tokens": 19619654.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.46338313964213246, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.08157179690897465, "learning_rate": 1.2987089105910155e-05, "loss": 0.0033, "num_tokens": 19627061.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.46356760745249953, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.05523867590818554, "learning_rate": 1.2980942195435922e-05, "loss": 0.0022, "num_tokens": 19637727.0, "reward": 1.7625000476837158, "reward_std": 0.4470139443874359, "rewards/fixed_code_pass_all_test_reward/mean": 0.887499988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.27998724579811096, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4637520752628666, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.043351505883038044, "learning_rate": 1.2974794048531232e-05, "loss": 0.0017, "num_tokens": 19641937.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 689.375, "completions/mean_terminated_length": 689.375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.46393654307323373, "frac_reward_zero_std": 1.0, "grad_norm": 0.044921875, "kl": 0.031737949000671506, "learning_rate": 1.2968644667746207e-05, "loss": 0.0013, "num_tokens": 19662708.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 434.375, "completions/mean_terminated_length": 434.375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.4641210108836008, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.03404521639458835, "learning_rate": 1.2962494055631481e-05, "loss": 0.0014, "num_tokens": 19670143.0, "reward": 1.6964285373687744, "reward_std": 0.7016744017601013, "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.3642157018184662, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.4643054786939679, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.059438387397676706, "learning_rate": 1.2956342214738188e-05, "loss": 0.0024, "num_tokens": 19676367.0, "reward": 1.6764705181121826, "reward_std": 0.6903042793273926, "rewards/fixed_code_pass_all_test_reward/mean": 0.8014705777168274, "rewards/fixed_code_pass_all_test_reward/std": 0.35004058480262756, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.464489946504335, "frac_reward_zero_std": 1.0, "grad_norm": 0.1572265625, "kl": 0.045052382396534085, "learning_rate": 1.2950189147617988e-05, "loss": 0.0018, "num_tokens": 19680181.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 295.875, "completions/mean_terminated_length": 295.875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.4646744143147021, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.06967951031401753, "learning_rate": 1.2944034856823033e-05, "loss": 0.0028, "num_tokens": 19691292.0, "reward": 1.3229166269302368, "reward_std": 0.3131937086582184, "rewards/fixed_code_pass_all_test_reward/mean": 0.3229166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.313193678855896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.46485888212506915, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.11151231173425913, "learning_rate": 1.2937879344905995e-05, "loss": 0.0045, "num_tokens": 19698372.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 452.25, "completions/mean_terminated_length": 452.25, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.4650433499354363, "frac_reward_zero_std": 0.0, "grad_norm": 0.734375, "kl": 0.05091506754979491, "learning_rate": 1.2931722614420041e-05, "loss": 0.002, "num_tokens": 19711078.0, "reward": 1.9110169410705566, "reward_std": 0.035954561084508896, "rewards/fixed_code_pass_all_test_reward/mean": 0.9110169410705566, "rewards/fixed_code_pass_all_test_reward/std": 0.035954590886831284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 243.125, "completions/mean_terminated_length": 243.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.46522781774580335, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.04728512302972376, "learning_rate": 1.2925564667918857e-05, "loss": 0.0019, "num_tokens": 19717583.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 113.125, "completions/mean_terminated_length": 113.125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.4654122855561704, "frac_reward_zero_std": 1.0, "grad_norm": 0.40625, "kl": 0.1569471675902605, "learning_rate": 1.2919405507956623e-05, "loss": 0.0063, "num_tokens": 19723600.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 588.625, "completions/mean_terminated_length": 588.625, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.46559675336653755, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.06591549050062895, "learning_rate": 1.2913245137088024e-05, "loss": 0.0026, "num_tokens": 19739221.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.4657812211769046, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.06458414089865983, "learning_rate": 1.2907083557868251e-05, "loss": 0.0026, "num_tokens": 19748712.0, "reward": 1.850000023841858, "reward_std": 0.2777460217475891, "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2777460217475891, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.4659656889872717, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.06744303973391652, "learning_rate": 1.2900920772852992e-05, "loss": 0.0027, "num_tokens": 19758240.0, "reward": 1.0367646217346191, "reward_std": 0.006932409945875406, "rewards/fixed_code_pass_all_test_reward/mean": 0.036764707416296005, "rewards/fixed_code_pass_all_test_reward/std": 0.00693241972476244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.4661501567976388, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.05038240528665483, "learning_rate": 1.289475678459844e-05, "loss": 0.002, "num_tokens": 19763276.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.4663346246080059, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.04504129965789616, "learning_rate": 1.2888591595661281e-05, "loss": 0.0018, "num_tokens": 19767588.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.46651909241837297, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.07204801449552178, "learning_rate": 1.2882425208598708e-05, "loss": 0.0029, "num_tokens": 19779579.0, "reward": 1.274999976158142, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 500.625, "completions/mean_terminated_length": 500.625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.4667035602287401, "frac_reward_zero_std": 1.0, "grad_norm": 0.041015625, "kl": 0.02046745241386816, "learning_rate": 1.2876257625968398e-05, "loss": 0.0008, "num_tokens": 19788920.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.4668880280391072, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.04973413376137614, "learning_rate": 1.2870088850328535e-05, "loss": 0.002, "num_tokens": 19796449.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.46707249584947425, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.0761973054613918, "learning_rate": 1.2863918884237794e-05, "loss": 0.003, "num_tokens": 19801708.0, "reward": 1.763157844543457, "reward_std": 0.21859537065029144, "rewards/fixed_code_pass_all_test_reward/mean": 0.763157844543457, "rewards/fixed_code_pass_all_test_reward/std": 0.21859537065029144, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 192.5, "completions/mean_terminated_length": 192.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.4672569636598414, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.05019446928054094, "learning_rate": 1.285774773025534e-05, "loss": 0.002, "num_tokens": 19806232.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 218.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.46744143147020845, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.04473037726711482, "learning_rate": 1.2851575390940837e-05, "loss": 0.0018, "num_tokens": 19813909.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 307.875, "completions/mean_terminated_length": 307.875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.4676258992805755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.037740444764494896, "learning_rate": 1.2845401868854439e-05, "loss": 0.0015, "num_tokens": 19820956.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 416.875, "completions/mean_terminated_length": 416.875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.46781036709094265, "frac_reward_zero_std": 1.0, "grad_norm": 0.05322265625, "kl": 0.03706486476585269, "learning_rate": 1.283922716655679e-05, "loss": 0.0015, "num_tokens": 19834387.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 320.75, "completions/mean_terminated_length": 320.75, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4679948349013097, "frac_reward_zero_std": 1.0, "grad_norm": 0.1982421875, "kl": 0.047755922423675656, "learning_rate": 1.2833051286609024e-05, "loss": 0.0019, "num_tokens": 19845497.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 133.75, "completions/mean_terminated_length": 133.75, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.4681793027116768, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.07160097965970635, "learning_rate": 1.282687423157276e-05, "loss": 0.0029, "num_tokens": 19853351.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.4683637705220439, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.05896443501114845, "learning_rate": 1.2820696004010106e-05, "loss": 0.0024, "num_tokens": 19860049.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 215.5, "completions/mean_terminated_length": 215.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.468548238332411, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.04848815035074949, "learning_rate": 1.281451660648366e-05, "loss": 0.0019, "num_tokens": 19865269.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 357.25, "completions/mean_terminated_length": 357.25, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.46873270614277807, "frac_reward_zero_std": 1.0, "grad_norm": 0.04296875, "kl": 0.028645648388192058, "learning_rate": 1.2808336041556504e-05, "loss": 0.0011, "num_tokens": 19872207.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.4689171739531452, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.06584410858340561, "learning_rate": 1.2802154311792196e-05, "loss": 0.0026, "num_tokens": 19879451.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 511.5, "completions/mean_terminated_length": 511.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.46910164176351227, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.04276907071471214, "learning_rate": 1.2795971419754793e-05, "loss": 0.0017, "num_tokens": 19888695.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 328.375, "completions/mean_terminated_length": 328.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.46928610957387934, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.06671121949329972, "learning_rate": 1.2789787368008819e-05, "loss": 0.0027, "num_tokens": 19894490.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.46947057738424647, "frac_reward_zero_std": 1.0, "grad_norm": 0.369140625, "kl": 0.08964411658234894, "learning_rate": 1.278360215911929e-05, "loss": 0.0036, "num_tokens": 19898832.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 209.875, "completions/mean_terminated_length": 209.875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.46965504519461354, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0326445639366284, "learning_rate": 1.2777415795651692e-05, "loss": 0.0013, "num_tokens": 19905015.0, "reward": 1.75, "reward_std": 0.27196410298347473, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.2719641625881195, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 223.25, "completions/mean_terminated_length": 223.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.4698395130049806, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.03795015986543149, "learning_rate": 1.2771228280171996e-05, "loss": 0.0015, "num_tokens": 19913625.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.47002398081534774, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.06938596582040191, "learning_rate": 1.2765039615246651e-05, "loss": 0.0028, "num_tokens": 19919923.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 477.625, "completions/mean_terminated_length": 477.625, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.4702084486257148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05712890625, "kl": 0.03389084432274103, "learning_rate": 1.2758849803442583e-05, "loss": 0.0014, "num_tokens": 19933712.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 200.625, "completions/mean_terminated_length": 200.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.4703929164360819, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.06271796859800816, "learning_rate": 1.275265884732719e-05, "loss": 0.0025, "num_tokens": 19938181.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 530.625, "completions/mean_terminated_length": 530.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.470577384246449, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.07696580491028726, "learning_rate": 1.2746466749468348e-05, "loss": 0.0031, "num_tokens": 19950858.0, "reward": 1.704545497894287, "reward_std": 0.3628237843513489, "rewards/fixed_code_pass_all_test_reward/mean": 0.7045454382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.3628237843513489, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.4707618520568161, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796875, "kl": 0.05574874114245176, "learning_rate": 1.2740273512434405e-05, "loss": 0.0022, "num_tokens": 19960404.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.47094631986718316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.04499903577379882, "learning_rate": 1.273407913879418e-05, "loss": 0.0018, "num_tokens": 19968654.0, "reward": 1.7391304969787598, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.739130437374115, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.4711307876775503, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.06945967907086015, "learning_rate": 1.2727883631116969e-05, "loss": 0.0028, "num_tokens": 19977297.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.47131525548791736, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.04515921697020531, "learning_rate": 1.2721686991972533e-05, "loss": 0.0018, "num_tokens": 19981588.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.47149972329828443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.03780234116129577, "learning_rate": 1.2715489223931105e-05, "loss": 0.0015, "num_tokens": 19987098.0, "reward": 1.375, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 268.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.47168419110865156, "frac_reward_zero_std": 1.0, "grad_norm": 0.1689453125, "kl": 0.08849153807386756, "learning_rate": 1.2709290329563386e-05, "loss": 0.0035, "num_tokens": 19995930.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 165.375, "completions/mean_terminated_length": 165.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.47186865891901864, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.07850931398570538, "learning_rate": 1.270309031144054e-05, "loss": 0.0031, "num_tokens": 20003997.0, "reward": 1.7941176891326904, "reward_std": 0.3812199831008911, "rewards/fixed_code_pass_all_test_reward/mean": 0.7941176891326904, "rewards/fixed_code_pass_all_test_reward/std": 0.3812200427055359, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 258.75, "completions/mean_terminated_length": 258.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.4720531267293857, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.08202125923708081, "learning_rate": 1.2696889172134204e-05, "loss": 0.0033, "num_tokens": 20011899.0, "reward": 1.15625, "reward_std": 0.1293872892856598, "rewards/fixed_code_pass_all_test_reward/mean": 0.28125, "rewards/fixed_code_pass_all_test_reward/std": 0.24775780737400055, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.47223759453975284, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.05090927565470338, "learning_rate": 1.2690686914216475e-05, "loss": 0.002, "num_tokens": 20017875.0, "reward": 1.33152174949646, "reward_std": 0.046115659177303314, "rewards/fixed_code_pass_all_test_reward/mean": 0.33152174949645996, "rewards/fixed_code_pass_all_test_reward/std": 0.04611566662788391, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 490.375, "completions/mean_terminated_length": 490.375, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.4724220623501199, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.03768021357245743, "learning_rate": 1.268448354025992e-05, "loss": 0.0015, "num_tokens": 20030926.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 291.25, "completions/mean_terminated_length": 291.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.472606530160487, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.09544111369177699, "learning_rate": 1.2678279052837557e-05, "loss": 0.0038, "num_tokens": 20041568.0, "reward": 1.8239796161651611, "reward_std": 0.2965334951877594, "rewards/fixed_code_pass_all_test_reward/mean": 0.8239796161651611, "rewards/fixed_code_pass_all_test_reward/std": 0.2965334951877594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 420.625, "completions/mean_terminated_length": 420.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.4727909979708541, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.029407728696241975, "learning_rate": 1.2672073454522877e-05, "loss": 0.0012, "num_tokens": 20049253.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.4729754657812212, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.06644888059236109, "learning_rate": 1.2665866747889828e-05, "loss": 0.0027, "num_tokens": 20059184.0, "reward": 1.3928570747375488, "reward_std": 0.3747367262840271, "rewards/fixed_code_pass_all_test_reward/mean": 0.3928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.3747367262840271, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 336.5, "completions/mean_terminated_length": 336.5, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.47315993359158826, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.06217698729597032, "learning_rate": 1.2659658935512822e-05, "loss": 0.0025, "num_tokens": 20065556.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.4733444014019554, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.05665436573326588, "learning_rate": 1.265345001996672e-05, "loss": 0.0023, "num_tokens": 20070123.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.47352886921232246, "frac_reward_zero_std": 0.0, "grad_norm": 3.75, "kl": 0.14209725055843592, "learning_rate": 1.2647240003826847e-05, "loss": 0.0057, "num_tokens": 20078861.0, "reward": 0.3392857313156128, "reward_std": 0.6282351016998291, "rewards/fixed_code_pass_all_test_reward/mean": 0.0892857164144516, "rewards/fixed_code_pass_all_test_reward/std": 0.16532501578330994, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 169.875, "completions/mean_terminated_length": 169.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.47371333702268953, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.04620095947757363, "learning_rate": 1.2641028889668987e-05, "loss": 0.0018, "num_tokens": 20087180.0, "reward": 1.6521739959716797, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6521739363670349, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.47389780483305666, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.07360194250941277, "learning_rate": 1.263481668006937e-05, "loss": 0.0029, "num_tokens": 20097418.0, "reward": 1.545454502105713, "reward_std": 0.4859295189380646, "rewards/fixed_code_pass_all_test_reward/mean": 0.5454545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.48592954874038696, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 267.0, "completions/mean_terminated_length": 267.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.47408227264342373, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.05930672422982752, "learning_rate": 1.2628603377604694e-05, "loss": 0.0024, "num_tokens": 20103986.0, "reward": 1.3641304969787598, "reward_std": 0.1674860566854477, "rewards/fixed_code_pass_all_test_reward/mean": 0.364130437374115, "rewards/fixed_code_pass_all_test_reward/std": 0.1674860566854477, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 268.875, "completions/mean_terminated_length": 268.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.4742667404537908, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.07382984273135662, "learning_rate": 1.2622388984852094e-05, "loss": 0.003, "num_tokens": 20109113.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.47445120826415793, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.07526638428680599, "learning_rate": 1.2616173504389173e-05, "loss": 0.003, "num_tokens": 20114939.0, "reward": 1.2208333015441895, "reward_std": 0.33187177777290344, "rewards/fixed_code_pass_all_test_reward/mean": 0.22083333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.33187180757522583, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.474635676074525, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.04516297020018101, "learning_rate": 1.2609956938793975e-05, "loss": 0.0018, "num_tokens": 20119015.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 250.0, "completions/mean_terminated_length": 250.0, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4748201438848921, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.08471470233052969, "learning_rate": 1.2603739290644996e-05, "loss": 0.0034, "num_tokens": 20125655.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.4750046116952592, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.05128744547255337, "learning_rate": 1.2597520562521188e-05, "loss": 0.0021, "num_tokens": 20134865.0, "reward": 1.975000023841858, "reward_std": 0.04629099741578102, "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.04629101976752281, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 317.5, "completions/mean_terminated_length": 317.5, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.4751890795056263, "frac_reward_zero_std": 1.0, "grad_norm": 0.043212890625, "kl": 0.031058266176842153, "learning_rate": 1.2591300757001938e-05, "loss": 0.0012, "num_tokens": 20142365.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.47537354731599335, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.07771226670593023, "learning_rate": 1.2585079876667093e-05, "loss": 0.0031, "num_tokens": 20150942.0, "reward": 1.072115421295166, "reward_std": 0.013598235324025154, "rewards/fixed_code_pass_all_test_reward/mean": 0.07211539149284363, "rewards/fixed_code_pass_all_test_reward/std": 0.01359820831567049, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 449.375, "completions/mean_terminated_length": 449.375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.4755580151263604, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.05035068187862635, "learning_rate": 1.2578857924096935e-05, "loss": 0.002, "num_tokens": 20163601.0, "reward": 0.9872881174087524, "reward_std": 0.9107030034065247, "rewards/fixed_code_pass_all_test_reward/mean": 0.48728811740875244, "rewards/fixed_code_pass_all_test_reward/std": 0.5219953656196594, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 343.125, "completions/mean_terminated_length": 343.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.47574248293672755, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.11356638884171844, "learning_rate": 1.2572634901872198e-05, "loss": 0.0045, "num_tokens": 20175490.0, "reward": 0.8977272510528564, "reward_std": 0.5675970911979675, "rewards/fixed_code_pass_all_test_reward/mean": 0.39772725105285645, "rewards/fixed_code_pass_all_test_reward/std": 0.27461469173431396, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 515.125, "completions/mean_terminated_length": 515.125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.4759269507470946, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.06148114753887057, "learning_rate": 1.2566410812574057e-05, "loss": 0.0025, "num_tokens": 20188011.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 256.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4761114185574617, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.055311979027464986, "learning_rate": 1.2560185658784131e-05, "loss": 0.0022, "num_tokens": 20197501.0, "reward": 1.0714285373687744, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.4762958863678288, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.08241274813190103, "learning_rate": 1.2553959443084478e-05, "loss": 0.0033, "num_tokens": 20205178.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 413.375, "completions/mean_terminated_length": 413.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.4764803541781959, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.06377499736845493, "learning_rate": 1.2547732168057596e-05, "loss": 0.0026, "num_tokens": 20212541.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.47666482198856297, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.04650209238752723, "learning_rate": 1.2541503836286427e-05, "loss": 0.0019, "num_tokens": 20221431.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 357.375, "completions/mean_terminated_length": 357.375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.4768492897989301, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.04192106076516211, "learning_rate": 1.2535274450354344e-05, "loss": 0.0017, "num_tokens": 20228274.0, "reward": 1.84375, "reward_std": 0.2893187701702118, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.2893187701702118, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.47703375760929717, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.07558167586103082, "learning_rate": 1.252904401284517e-05, "loss": 0.003, "num_tokens": 20232596.0, "reward": 1.9249999523162842, "reward_std": 0.2121320217847824, "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 261.0, "completions/mean_terminated_length": 261.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.47721822541966424, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.06788758840411901, "learning_rate": 1.2522812526343149e-05, "loss": 0.0027, "num_tokens": 20241092.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 198.125, "completions/mean_terminated_length": 198.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.47740269323003137, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.06777821201831102, "learning_rate": 1.251657999343297e-05, "loss": 0.0027, "num_tokens": 20249501.0, "reward": 1.875, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 492.625, "completions/mean_terminated_length": 492.625, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.47758716104039844, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005859375, "kl": 0.03578619204927236, "learning_rate": 1.2510346416699753e-05, "loss": 0.0014, "num_tokens": 20258290.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 321.125, "completions/mean_terminated_length": 321.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.4777716288507655, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.047403352218680084, "learning_rate": 1.250411179872905e-05, "loss": 0.0019, "num_tokens": 20266723.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 812.625, "completions/mean_terminated_length": 812.625, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.47795609666113265, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.039258404984138906, "learning_rate": 1.2497876142106848e-05, "loss": 0.0016, "num_tokens": 20283832.0, "reward": 1.4375, "reward_std": 0.8210402131080627, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.4027436077594757, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 316.625, "completions/mean_terminated_length": 316.625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.4781405644714997, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.03613462974317372, "learning_rate": 1.2491639449419559e-05, "loss": 0.0014, "num_tokens": 20291861.0, "reward": 1.3125, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.4783250322818668, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.05590608646161854, "learning_rate": 1.248540172325403e-05, "loss": 0.0022, "num_tokens": 20299836.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.4785095000922339, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.05412229849025607, "learning_rate": 1.2479162966197534e-05, "loss": 0.0022, "num_tokens": 20305353.0, "reward": 1.0612244606018066, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.06122449040412903, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 236.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.478693967902601, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.09874772932380438, "learning_rate": 1.2472923180837777e-05, "loss": 0.0039, "num_tokens": 20311573.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 218.375, "completions/mean_terminated_length": 218.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.47887843571296806, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.05383230000734329, "learning_rate": 1.2466682369762883e-05, "loss": 0.0022, "num_tokens": 20316312.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 86.875, "completions/mean_terminated_length": 86.875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.4790629035233352, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.0662866837810725, "learning_rate": 1.246044053556141e-05, "loss": 0.0027, "num_tokens": 20323239.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.47924737133370227, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.050896984059363604, "learning_rate": 1.2454197680822334e-05, "loss": 0.002, "num_tokens": 20330803.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.47943183914406934, "frac_reward_zero_std": 1.0, "grad_norm": 2.890625, "kl": 0.14371900819242, "learning_rate": 1.2447953808135051e-05, "loss": 0.0057, "num_tokens": 20338592.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 304.125, "completions/mean_terminated_length": 304.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.47961630695443647, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.064185458002612, "learning_rate": 1.2441708920089393e-05, "loss": 0.0026, "num_tokens": 20349545.0, "reward": 1.3072917461395264, "reward_std": 0.13717946410179138, "rewards/fixed_code_pass_all_test_reward/mean": 0.3072916567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.13717946410179138, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 240.0, "completions/mean_terminated_length": 240.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.47980077476480354, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.04901211918331683, "learning_rate": 1.2435463019275597e-05, "loss": 0.002, "num_tokens": 20354697.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.4799852425751706, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.04711644269991666, "learning_rate": 1.2429216108284334e-05, "loss": 0.0019, "num_tokens": 20359653.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.48016971038553774, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.08042169106192887, "learning_rate": 1.2422968189706686e-05, "loss": 0.0032, "num_tokens": 20367912.0, "reward": 1.4821429252624512, "reward_std": 0.14718502759933472, "rewards/fixed_code_pass_all_test_reward/mean": 0.4821428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.14718502759933472, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.4803541781959048, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.08020741958171129, "learning_rate": 1.2416719266134153e-05, "loss": 0.0032, "num_tokens": 20372577.0, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 247.125, "completions/mean_terminated_length": 247.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.4805386460062719, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.04891262820456177, "learning_rate": 1.2410469340158655e-05, "loss": 0.002, "num_tokens": 20377394.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 325.625, "completions/mean_terminated_length": 325.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.480723113816639, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.07280003046616912, "learning_rate": 1.2404218414372526e-05, "loss": 0.0029, "num_tokens": 20384583.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 230.5, "completions/mean_terminated_length": 230.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.4809075816270061, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.08118718350306153, "learning_rate": 1.2397966491368516e-05, "loss": 0.0032, "num_tokens": 20394099.0, "reward": 1.5563380718231201, "reward_std": 0.09493059664964676, "rewards/fixed_code_pass_all_test_reward/mean": 0.5563380122184753, "rewards/fixed_code_pass_all_test_reward/std": 0.09493060410022736, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.48109204943737316, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.07420216547325253, "learning_rate": 1.2391713573739785e-05, "loss": 0.003, "num_tokens": 20398164.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 282.625, "completions/mean_terminated_length": 282.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.4812765172477403, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.06373748031910509, "learning_rate": 1.2385459664079914e-05, "loss": 0.0025, "num_tokens": 20404721.0, "reward": 1.75, "reward_std": 0.407033771276474, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4070337414741516, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 208.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.48146098505810736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.05387788591906428, "learning_rate": 1.237920476498288e-05, "loss": 0.0022, "num_tokens": 20411614.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 232.625, "completions/mean_terminated_length": 232.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.48164545286847443, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.056499975733458996, "learning_rate": 1.2372948879043087e-05, "loss": 0.0023, "num_tokens": 20421619.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 382.25, "completions/mean_terminated_length": 382.25, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.48182992067884156, "frac_reward_zero_std": 1.0, "grad_norm": 0.061279296875, "kl": 0.04989079129882157, "learning_rate": 1.2366692008855341e-05, "loss": 0.002, "num_tokens": 20431141.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.48201438848920863, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.0975995003245771, "learning_rate": 1.2360434157014857e-05, "loss": 0.0039, "num_tokens": 20442260.0, "reward": 1.2943549156188965, "reward_std": 0.36268556118011475, "rewards/fixed_code_pass_all_test_reward/mean": 0.4193548262119293, "rewards/fixed_code_pass_all_test_reward/std": 0.15803158283233643, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 261.5, "completions/mean_terminated_length": 261.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.4821988562995757, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.09738655062392354, "learning_rate": 1.2354175326117252e-05, "loss": 0.0039, "num_tokens": 20452448.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.48238332410994283, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "kl": 0.07269073696807027, "learning_rate": 1.234791551875856e-05, "loss": 0.0029, "num_tokens": 20456612.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 329.875, "completions/mean_terminated_length": 329.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.4825677919203099, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.06926806783303618, "learning_rate": 1.2341654737535214e-05, "loss": 0.0028, "num_tokens": 20467203.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 429.125, "completions/mean_terminated_length": 429.125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.482752259730677, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.04573037754744291, "learning_rate": 1.2335392985044044e-05, "loss": 0.0018, "num_tokens": 20481188.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.4829367275410441, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.05107672745361924, "learning_rate": 1.2329130263882294e-05, "loss": 0.002, "num_tokens": 20485215.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 206.5, "completions/mean_terminated_length": 206.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.4831211953514112, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.051320620346814394, "learning_rate": 1.2322866576647607e-05, "loss": 0.0021, "num_tokens": 20490891.0, "reward": 1.4191176891326904, "reward_std": 0.17959420382976532, "rewards/fixed_code_pass_all_test_reward/mean": 0.41911762952804565, "rewards/fixed_code_pass_all_test_reward/std": 0.17959420382976532, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 177.625, "completions/mean_terminated_length": 177.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.48330566316177825, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.09149549389258027, "learning_rate": 1.2316601925938025e-05, "loss": 0.0037, "num_tokens": 20500232.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 354.375, "completions/mean_terminated_length": 354.375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.4834901309721454, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.039620644645765424, "learning_rate": 1.2310336314351985e-05, "loss": 0.0016, "num_tokens": 20509355.0, "reward": 1.625, "reward_std": 0.11785116046667099, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511381149292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 260.875, "completions/mean_terminated_length": 260.875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.48367459878251245, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.046551215229555964, "learning_rate": 1.2304069744488333e-05, "loss": 0.0019, "num_tokens": 20517850.0, "reward": 1.0183823108673096, "reward_std": 0.05199316889047623, "rewards/fixed_code_pass_all_test_reward/mean": 0.018382353708148003, "rewards/fixed_code_pass_all_test_reward/std": 0.051993150264024734, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.4838590665928795, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.05018661718349904, "learning_rate": 1.2297802218946306e-05, "loss": 0.002, "num_tokens": 20526888.0, "reward": 1.0892857313156128, "reward_std": 0.07393564283847809, "rewards/fixed_code_pass_all_test_reward/mean": 0.0892857164144516, "rewards/fixed_code_pass_all_test_reward/std": 0.0739356055855751, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 250.125, "completions/mean_terminated_length": 250.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.48404353440324666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.05838344804942608, "learning_rate": 1.2291533740325533e-05, "loss": 0.0023, "num_tokens": 20536473.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.48422800221361373, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.09572603087872267, "learning_rate": 1.2285264311226053e-05, "loss": 0.0038, "num_tokens": 20540553.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.4844124700239808, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.0881379172205925, "learning_rate": 1.227899393424828e-05, "loss": 0.0035, "num_tokens": 20548301.0, "reward": 1.6351351737976074, "reward_std": 0.44032377004623413, "rewards/fixed_code_pass_all_test_reward/mean": 0.6351351141929626, "rewards/fixed_code_pass_all_test_reward/std": 0.44032377004623413, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.48459693783434793, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.09719450725242496, "learning_rate": 1.2272722611993038e-05, "loss": 0.0039, "num_tokens": 20554806.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 229.0, "completions/mean_terminated_length": 229.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.484781405644715, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.05712538631632924, "learning_rate": 1.2266450347061532e-05, "loss": 0.0023, "num_tokens": 20559470.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 247.375, "completions/mean_terminated_length": 247.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.4849658734550821, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0772277417127043, "learning_rate": 1.2260177142055362e-05, "loss": 0.0031, "num_tokens": 20568121.0, "reward": 1.6875, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.45806270837783813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.4851503412654492, "frac_reward_zero_std": 1.0, "grad_norm": 0.039306640625, "kl": 0.03148372005671263, "learning_rate": 1.2253902999576522e-05, "loss": 0.0013, "num_tokens": 20576143.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 169.25, "completions/mean_terminated_length": 169.25, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.4853348090758163, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "kl": 0.06574563216418028, "learning_rate": 1.2247627922227387e-05, "loss": 0.0026, "num_tokens": 20580257.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.48551927688618335, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.06422035221476108, "learning_rate": 1.2241351912610726e-05, "loss": 0.0026, "num_tokens": 20587882.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.4857037446965505, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.06303377449512482, "learning_rate": 1.223507497332969e-05, "loss": 0.0025, "num_tokens": 20592267.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.48588821250691755, "frac_reward_zero_std": 0.0, "grad_norm": 3.21875, "kl": 0.0941413133405149, "learning_rate": 1.2228797106987817e-05, "loss": 0.0038, "num_tokens": 20599496.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.4860726803172846, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.05220174859277904, "learning_rate": 1.2222518316189033e-05, "loss": 0.0021, "num_tokens": 20604914.0, "reward": 1.9456522464752197, "reward_std": 0.15371885895729065, "rewards/fixed_code_pass_all_test_reward/mean": 0.945652186870575, "rewards/fixed_code_pass_all_test_reward/std": 0.15371887385845184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 292.625, "completions/mean_terminated_length": 292.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.48625714812765175, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.05911744199693203, "learning_rate": 1.2216238603537649e-05, "loss": 0.0024, "num_tokens": 20611503.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 128.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.4864416159380188, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.06760795181617141, "learning_rate": 1.2209957971638344e-05, "loss": 0.0027, "num_tokens": 20618767.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 181.375, "completions/mean_terminated_length": 181.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.4866260837483859, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.07167362933978438, "learning_rate": 1.2203676423096199e-05, "loss": 0.0029, "num_tokens": 20624210.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.486810551558753, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.06834456557407975, "learning_rate": 1.2197393960516655e-05, "loss": 0.0027, "num_tokens": 20633214.0, "reward": 1.2946429252624512, "reward_std": 0.12912297248840332, "rewards/fixed_code_pass_all_test_reward/mean": 0.2946428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.12912297248840332, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.4869950193691201, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.05727401049807668, "learning_rate": 1.2191110586505548e-05, "loss": 0.0023, "num_tokens": 20637425.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.48717948717948717, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.05439366714563221, "learning_rate": 1.2184826303669084e-05, "loss": 0.0022, "num_tokens": 20648785.0, "reward": 1.7702703475952148, "reward_std": 0.4275789260864258, "rewards/fixed_code_pass_all_test_reward/mean": 0.7702702283859253, "rewards/fixed_code_pass_all_test_reward/std": 0.4275789260864258, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.4873639549898543, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.048035869374871254, "learning_rate": 1.2178541114613847e-05, "loss": 0.0019, "num_tokens": 20655017.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 285.5, "completions/mean_terminated_length": 285.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.48754842280022137, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.07854152098298073, "learning_rate": 1.2172255021946802e-05, "loss": 0.0031, "num_tokens": 20663541.0, "reward": 1.625, "reward_std": 0.42243334650993347, "rewards/fixed_code_pass_all_test_reward/mean": 0.7500000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.1721639484167099, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.48773289061058844, "frac_reward_zero_std": 1.0, "grad_norm": 0.3359375, "kl": 0.06193385529331863, "learning_rate": 1.2165968028275277e-05, "loss": 0.0025, "num_tokens": 20668654.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 331.875, "completions/mean_terminated_length": 331.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.4879173584209555, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.05510362610220909, "learning_rate": 1.2159680136206987e-05, "loss": 0.0022, "num_tokens": 20678421.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 371.0, "completions/mean_terminated_length": 371.0, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.48810182623132264, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.06777585693635046, "learning_rate": 1.2153391348350014e-05, "loss": 0.0027, "num_tokens": 20690045.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 280.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.4882862940416897, "frac_reward_zero_std": 1.0, "grad_norm": 0.208984375, "kl": 0.0477626696228981, "learning_rate": 1.2147101667312809e-05, "loss": 0.0019, "num_tokens": 20697366.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 178.125, "completions/mean_terminated_length": 178.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.4884707618520568, "frac_reward_zero_std": 1.0, "grad_norm": 0.486328125, "kl": 0.16009643487632275, "learning_rate": 1.2140811095704195e-05, "loss": 0.0064, "num_tokens": 20701719.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4886552296624239, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.05128324078395963, "learning_rate": 1.2134519636133369e-05, "loss": 0.0021, "num_tokens": 20710831.0, "reward": 1.003787875175476, "reward_std": 0.010713729076087475, "rewards/fixed_code_pass_all_test_reward/mean": 0.0037878789007663727, "rewards/fixed_code_pass_all_test_reward/std": 0.010713739320635796, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 327.125, "completions/mean_terminated_length": 327.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.488839697472791, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.05914975330233574, "learning_rate": 1.212822729120989e-05, "loss": 0.0024, "num_tokens": 20718568.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.48902416528315806, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.06846488174051046, "learning_rate": 1.2121934063543686e-05, "loss": 0.0027, "num_tokens": 20724502.0, "reward": 1.4255318641662598, "reward_std": 0.4168568253517151, "rewards/fixed_code_pass_all_test_reward/mean": 0.42553192377090454, "rewards/fixed_code_pass_all_test_reward/std": 0.41685688495635986, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 290.75, "completions/mean_terminated_length": 290.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.4892086330935252, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.06108893221244216, "learning_rate": 1.2115639955745055e-05, "loss": 0.0024, "num_tokens": 20731284.0, "reward": 1.6195652484893799, "reward_std": 0.1483549028635025, "rewards/fixed_code_pass_all_test_reward/mean": 0.6195651888847351, "rewards/fixed_code_pass_all_test_reward/std": 0.1483549177646637, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.48939310090389226, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.10655739158391953, "learning_rate": 1.2109344970424655e-05, "loss": 0.0043, "num_tokens": 20739064.0, "reward": 1.931249976158142, "reward_std": 0.12229211628437042, "rewards/fixed_code_pass_all_test_reward/mean": 0.9312499761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.12229210883378983, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 505.125, "completions/mean_terminated_length": 505.125, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.48957756871425934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.02828190999571234, "learning_rate": 1.2103049110193508e-05, "loss": 0.0011, "num_tokens": 20758457.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 257.5, "completions/mean_terminated_length": 257.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.48976203652462647, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0738303866237402, "learning_rate": 1.2096752377663003e-05, "loss": 0.003, "num_tokens": 20768413.0, "reward": 1.672965168952942, "reward_std": 0.45135095715522766, "rewards/fixed_code_pass_all_test_reward/mean": 0.6729651093482971, "rewards/fixed_code_pass_all_test_reward/std": 0.45135101675987244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.48994650433499354, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.05979805439710617, "learning_rate": 1.209045477544489e-05, "loss": 0.0024, "num_tokens": 20773124.0, "reward": 1.3977272510528564, "reward_std": 0.16070610284805298, "rewards/fixed_code_pass_all_test_reward/mean": 0.39772728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.16070608794689178, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 282.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.4901309721453606, "frac_reward_zero_std": 1.0, "grad_norm": 0.054931640625, "kl": 0.04649677325505763, "learning_rate": 1.2084156306151278e-05, "loss": 0.0019, "num_tokens": 20779464.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.49031543995572774, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.0484307836741209, "learning_rate": 1.2077856972394633e-05, "loss": 0.0019, "num_tokens": 20785509.0, "reward": 1.8365384340286255, "reward_std": 0.35474681854248047, "rewards/fixed_code_pass_all_test_reward/mean": 0.8365384340286255, "rewards/fixed_code_pass_all_test_reward/std": 0.35474681854248047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 280.625, "completions/mean_terminated_length": 280.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.4904999077660948, "frac_reward_zero_std": 1.0, "grad_norm": 0.052734375, "kl": 0.05530066508799791, "learning_rate": 1.2071556776787788e-05, "loss": 0.0022, "num_tokens": 20793578.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.4906843755764619, "frac_reward_zero_std": 0.0, "grad_norm": 8.25, "kl": 0.0847413525916636, "learning_rate": 1.2065255721943921e-05, "loss": 0.0034, "num_tokens": 20801477.0, "reward": 1.10546875, "reward_std": 0.08985587954521179, "rewards/fixed_code_pass_all_test_reward/mean": 0.10546875, "rewards/fixed_code_pass_all_test_reward/std": 0.08985587954521179, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 203.5, "completions/mean_terminated_length": 203.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.490868843386829, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.06930288299918175, "learning_rate": 1.2058953810476581e-05, "loss": 0.0028, "num_tokens": 20807153.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4910533111971961, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.073355364613235, "learning_rate": 1.2052651044999659e-05, "loss": 0.0029, "num_tokens": 20813898.0, "reward": 1.7604167461395264, "reward_std": 0.44418084621429443, "rewards/fixed_code_pass_all_test_reward/mean": 0.7604166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.44418084621429443, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 237.0, "completions/mean_terminated_length": 237.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.49123777900756316, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.09029515041038394, "learning_rate": 1.204634742812741e-05, "loss": 0.0036, "num_tokens": 20822394.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 312.875, "completions/mean_terminated_length": 312.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.4914222468179303, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.05131604732014239, "learning_rate": 1.2040042962474432e-05, "loss": 0.0021, "num_tokens": 20834105.0, "reward": 1.7142857313156128, "reward_std": 0.13955946266651154, "rewards/fixed_code_pass_all_test_reward/mean": 0.7142857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.13955944776535034, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.49160671462829736, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.1136559909209609, "learning_rate": 1.203373765065569e-05, "loss": 0.0045, "num_tokens": 20842054.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 169.875, "completions/mean_terminated_length": 169.875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.49179118243866443, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.07699846243485808, "learning_rate": 1.2027431495286486e-05, "loss": 0.0031, "num_tokens": 20848149.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 310.5, "completions/mean_terminated_length": 310.5, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.49197565024903156, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.023115876712836325, "learning_rate": 1.2021124498982477e-05, "loss": 0.0009, "num_tokens": 20855945.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 315.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.49216011805939863, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.037785286316648126, "learning_rate": 1.2014816664359671e-05, "loss": 0.0015, "num_tokens": 20867042.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.4923445858697657, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.0595626593567431, "learning_rate": 1.2008507994034417e-05, "loss": 0.0024, "num_tokens": 20874000.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 481.625, "completions/mean_terminated_length": 481.625, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.49252905368013283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0260009765625, "kl": 0.02039892948232591, "learning_rate": 1.2002198490623422e-05, "loss": 0.0008, "num_tokens": 20886421.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.4927135214904999, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.03898887615650892, "learning_rate": 1.1995888156743727e-05, "loss": 0.0016, "num_tokens": 20891909.0, "reward": 1.5721153020858765, "reward_std": 0.4501514434814453, "rewards/fixed_code_pass_all_test_reward/mean": 0.697115421295166, "rewards/fixed_code_pass_all_test_reward/std": 0.2953839898109436, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 369.625, "completions/mean_terminated_length": 369.625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.492897989300867, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.05486913048662245, "learning_rate": 1.1989576995012724e-05, "loss": 0.0022, "num_tokens": 20898594.0, "reward": 1.9500000476837158, "reward_std": 0.09258202463388443, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.4930824571112341, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.04290141724050045, "learning_rate": 1.1983265008048143e-05, "loss": 0.0017, "num_tokens": 20903094.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 480.875, "completions/mean_terminated_length": 480.875, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.4932669249216012, "frac_reward_zero_std": 1.0, "grad_norm": 0.05859375, "kl": 0.03806313616223633, "learning_rate": 1.1976952198468066e-05, "loss": 0.0015, "num_tokens": 20917941.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.49345139273196825, "frac_reward_zero_std": 1.0, "grad_norm": 0.042724609375, "kl": 0.023616835242137313, "learning_rate": 1.1970638568890906e-05, "loss": 0.0009, "num_tokens": 20925086.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 189.125, "completions/mean_terminated_length": 189.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.4936358605423354, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.04167644586414099, "learning_rate": 1.1964324121935423e-05, "loss": 0.0017, "num_tokens": 20933423.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 399.875, "completions/mean_terminated_length": 399.875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.49382032835270245, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.026422150433063507, "learning_rate": 1.1958008860220711e-05, "loss": 0.0011, "num_tokens": 20939598.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.4940047961630695, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "kl": 0.106956887524575, "learning_rate": 1.1951692786366202e-05, "loss": 0.0043, "num_tokens": 20952353.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 126.5, "completions/mean_terminated_length": 126.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.49418926397343665, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.11331632919609547, "learning_rate": 1.1945375902991673e-05, "loss": 0.0045, "num_tokens": 20961085.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.4943737317838037, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.038794966181740165, "learning_rate": 1.1939058212717225e-05, "loss": 0.0016, "num_tokens": 20967795.0, "reward": 1.477678656578064, "reward_std": 0.2850358784198761, "rewards/fixed_code_pass_all_test_reward/mean": 0.4776785671710968, "rewards/fixed_code_pass_all_test_reward/std": 0.2850358784198761, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 308.25, "completions/mean_terminated_length": 308.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.4945581995941708, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.0522110708989203, "learning_rate": 1.1932739718163307e-05, "loss": 0.0021, "num_tokens": 20974789.0, "reward": 1.7678571939468384, "reward_std": 0.25253817439079285, "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.49474266740453793, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.03153468365781009, "learning_rate": 1.1926420421950687e-05, "loss": 0.0013, "num_tokens": 20982845.0, "reward": 1.2638888359069824, "reward_std": 0.10695496201515198, "rewards/fixed_code_pass_all_test_reward/mean": 0.2638888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.10695488750934601, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.494927135214905, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.06071028159931302, "learning_rate": 1.192010032670047e-05, "loss": 0.0024, "num_tokens": 20987089.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.4951116030252721, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.12237499072216451, "learning_rate": 1.1913779435034106e-05, "loss": 0.0049, "num_tokens": 20992842.0, "reward": 1.6496212482452393, "reward_std": 0.31253358721733093, "rewards/fixed_code_pass_all_test_reward/mean": 0.6496212482452393, "rewards/fixed_code_pass_all_test_reward/std": 0.3125336170196533, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 383.75, "completions/mean_terminated_length": 383.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.4952960708356392, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.06221833219751716, "learning_rate": 1.1907457749573355e-05, "loss": 0.0025, "num_tokens": 20999168.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.4954805386460063, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.03897123225033283, "learning_rate": 1.190113527294032e-05, "loss": 0.0016, "num_tokens": 21006277.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 219.375, "completions/mean_terminated_length": 219.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.49566500645637335, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.06409923592582345, "learning_rate": 1.1894812007757425e-05, "loss": 0.0026, "num_tokens": 21015592.0, "reward": 1.5176470279693604, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5176470875740051, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 318.5, "completions/mean_terminated_length": 318.5, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.4958494742667405, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.024229480302892625, "learning_rate": 1.1888487956647428e-05, "loss": 0.001, "num_tokens": 21022300.0, "reward": 1.7249999046325684, "reward_std": 0.3535533547401428, "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2070196568965912, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.49603394207710755, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.03946744522545487, "learning_rate": 1.1882163122233404e-05, "loss": 0.0016, "num_tokens": 21027236.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 260.0, "completions/mean_terminated_length": 260.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4962184098874746, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.04029747168533504, "learning_rate": 1.1875837507138761e-05, "loss": 0.0016, "num_tokens": 21033956.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.49640287769784175, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0515853154938668, "learning_rate": 1.1869511113987227e-05, "loss": 0.0021, "num_tokens": 21044337.0, "reward": 0.984375, "reward_std": 0.6211937665939331, "rewards/fixed_code_pass_all_test_reward/mean": 0.234375, "rewards/fixed_code_pass_all_test_reward/std": 0.19408094882965088, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 366.625, "completions/mean_terminated_length": 366.625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.4965873455082088, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.07139591546729207, "learning_rate": 1.1863183945402855e-05, "loss": 0.0029, "num_tokens": 21056982.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.4967718133185759, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.045998001005500555, "learning_rate": 1.1856856004010013e-05, "loss": 0.0018, "num_tokens": 21061284.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.496956281128943, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "kl": 0.07007540948688984, "learning_rate": 1.1850527292433393e-05, "loss": 0.0028, "num_tokens": 21072580.0, "reward": 1.4583333730697632, "reward_std": 0.09155286848545074, "rewards/fixed_code_pass_all_test_reward/mean": 0.4583333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.09155285358428955, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 213.0, "completions/mean_terminated_length": 213.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.4971407489393101, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.06304393662139773, "learning_rate": 1.1844197813298018e-05, "loss": 0.0025, "num_tokens": 21081900.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 347.375, "completions/mean_terminated_length": 347.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.49732521674967717, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.03266024845652282, "learning_rate": 1.183786756922921e-05, "loss": 0.0013, "num_tokens": 21088431.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.4975096845600443, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.05862832581624389, "learning_rate": 1.1831536562852626e-05, "loss": 0.0023, "num_tokens": 21093084.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.49769415237041137, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.04328002501279116, "learning_rate": 1.1825204796794223e-05, "loss": 0.0017, "num_tokens": 21098784.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 504.25, "completions/mean_terminated_length": 504.25, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.49787862018077844, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.029938317951746285, "learning_rate": 1.1818872273680282e-05, "loss": 0.0012, "num_tokens": 21107826.0, "reward": 1.1124999523162842, "reward_std": 0.035355351865291595, "rewards/fixed_code_pass_all_test_reward/mean": 0.11249999701976776, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 300.0, "completions/mean_terminated_length": 300.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.49806308799114557, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.029234323417767882, "learning_rate": 1.1812538996137398e-05, "loss": 0.0012, "num_tokens": 21113906.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.49824755580151264, "frac_reward_zero_std": 1.0, "grad_norm": 0.33203125, "kl": 0.062060857424512506, "learning_rate": 1.180620496679248e-05, "loss": 0.0025, "num_tokens": 21117851.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 155.875, "completions/mean_terminated_length": 155.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.4984320236118797, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.04966692649759352, "learning_rate": 1.1799870188272746e-05, "loss": 0.002, "num_tokens": 21126474.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 244.375, "completions/mean_terminated_length": 244.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.49861649142224684, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.07408269075676799, "learning_rate": 1.1793534663205721e-05, "loss": 0.003, "num_tokens": 21131325.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 297.375, "completions/mean_terminated_length": 297.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.4988009592326139, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.07318145502358675, "learning_rate": 1.178719839421925e-05, "loss": 0.0029, "num_tokens": 21137592.0, "reward": 1.7999999523162842, "reward_std": 0.21380898356437683, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.21380898356437683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 116.625, "completions/mean_terminated_length": 116.625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.498985427042981, "frac_reward_zero_std": 0.0, "grad_norm": 3.953125, "kl": 0.12701131217181683, "learning_rate": 1.1780861383941472e-05, "loss": 0.0051, "num_tokens": 21145461.0, "reward": 1.768617033958435, "reward_std": 0.4302830994129181, "rewards/fixed_code_pass_all_test_reward/mean": 0.7686170339584351, "rewards/fixed_code_pass_all_test_reward/std": 0.4302831292152405, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4991698948533481, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.10169364977627993, "learning_rate": 1.1774523635000855e-05, "loss": 0.0041, "num_tokens": 21153315.0, "reward": 1.5912162065505981, "reward_std": 0.4677734076976776, "rewards/fixed_code_pass_all_test_reward/mean": 0.5912162065505981, "rewards/fixed_code_pass_all_test_reward/std": 0.4677734076976776, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 327.0, "completions/mean_terminated_length": 327.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.4993543626637152, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.07676458265632391, "learning_rate": 1.1768185150026148e-05, "loss": 0.0031, "num_tokens": 21159779.0, "reward": 1.125, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 196.125, "completions/mean_terminated_length": 196.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.49953883047408226, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.07351124985143542, "learning_rate": 1.1761845931646426e-05, "loss": 0.0029, "num_tokens": 21165452.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.4997232982844494, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.1009812937118113, "learning_rate": 1.1755505982491056e-05, "loss": 0.004, "num_tokens": 21177289.0, "reward": 1.0818965435028076, "reward_std": 0.01219149399548769, "rewards/fixed_code_pass_all_test_reward/mean": 0.08189655095338821, "rewards/fixed_code_pass_all_test_reward/std": 0.012191496789455414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 183.875, "completions/mean_terminated_length": 183.875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.49990776609481646, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.07043427461758256, "learning_rate": 1.174916530518971e-05, "loss": 0.0028, "num_tokens": 21184600.0, "reward": 1.797727346420288, "reward_std": 0.3881889581680298, "rewards/fixed_code_pass_all_test_reward/mean": 0.7977272868156433, "rewards/fixed_code_pass_all_test_reward/std": 0.3881889581680298, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.5000922339051835, "frac_reward_zero_std": 0.0, "grad_norm": 170.0, "kl": 12.302193319424987, "learning_rate": 1.174282390237237e-05, "loss": 0.4921, "num_tokens": 21190531.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.5002767017155506, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0717957061715424, "learning_rate": 1.1736481776669307e-05, "loss": 0.0029, "num_tokens": 21199237.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 232.625, "completions/mean_terminated_length": 232.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.5004611695259177, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.046601345064118505, "learning_rate": 1.1730138930711102e-05, "loss": 0.0019, "num_tokens": 21204178.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 450.5, "completions/mean_terminated_length": 450.5, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.5006456373362849, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.0505878000985831, "learning_rate": 1.1723795367128625e-05, "loss": 0.002, "num_tokens": 21216438.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 138.375, "completions/mean_terminated_length": 138.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5008301051466519, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.059616144048050046, "learning_rate": 1.1717451088553056e-05, "loss": 0.0024, "num_tokens": 21223681.0, "reward": 1.8809523582458496, "reward_std": 0.22043336927890778, "rewards/fixed_code_pass_all_test_reward/mean": 0.8809523582458496, "rewards/fixed_code_pass_all_test_reward/std": 0.22043335437774658, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 197.0, "completions/mean_terminated_length": 197.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.501014572957019, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.07333249785006046, "learning_rate": 1.1711106097615863e-05, "loss": 0.0029, "num_tokens": 21227921.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 425.375, "completions/mean_terminated_length": 425.375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.5011990407673861, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.02200914826244116, "learning_rate": 1.1704760396948807e-05, "loss": 0.0009, "num_tokens": 21235652.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 644.5, "completions/mean_terminated_length": 644.5, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.5013835085777532, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.035445502784568816, "learning_rate": 1.1698413989183951e-05, "loss": 0.0014, "num_tokens": 21253400.0, "reward": 1.4642856121063232, "reward_std": 0.4234386682510376, "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857015132904, "rewards/fixed_code_pass_all_test_reward/std": 0.4234386384487152, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.5015679763881202, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.07418782147578895, "learning_rate": 1.1692066876953652e-05, "loss": 0.003, "num_tokens": 21257650.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 474.0, "completions/mean_terminated_length": 474.0, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.5017524441984874, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.02285875694360584, "learning_rate": 1.1685719062890549e-05, "loss": 0.0009, "num_tokens": 21266346.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.5019369120088545, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.03489925852045417, "learning_rate": 1.1679370549627575e-05, "loss": 0.0014, "num_tokens": 21272004.0, "reward": 1.530172348022461, "reward_std": 0.6369484066963196, "rewards/fixed_code_pass_all_test_reward/mean": 0.6551724672317505, "rewards/fixed_code_pass_all_test_reward/std": 0.30579590797424316, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 215.5, "completions/mean_terminated_length": 215.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.5021213798192216, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.037027356680482626, "learning_rate": 1.1673021339797967e-05, "loss": 0.0015, "num_tokens": 21280624.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 314.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.5023058476295886, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.07478623231872916, "learning_rate": 1.1666671436035231e-05, "loss": 0.003, "num_tokens": 21292050.0, "reward": 1.02173912525177, "reward_std": 0.061487533152103424, "rewards/fixed_code_pass_all_test_reward/mean": 0.021739130839705467, "rewards/fixed_code_pass_all_test_reward/std": 0.061487551778554916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 311.625, "completions/mean_terminated_length": 311.625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.5024903154399557, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.029627330601215363, "learning_rate": 1.1660320840973176e-05, "loss": 0.0012, "num_tokens": 21298887.0, "reward": 1.53125, "reward_std": 0.5077524185180664, "rewards/fixed_code_pass_all_test_reward/mean": 0.53125, "rewards/fixed_code_pass_all_test_reward/std": 0.5077524185180664, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 166.125, "completions/mean_terminated_length": 166.125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.5026747832503228, "frac_reward_zero_std": 0.0, "grad_norm": 3.9375, "kl": 0.12178614130243659, "learning_rate": 1.1653969557245887e-05, "loss": 0.0049, "num_tokens": 21307136.0, "reward": 1.7941176891326904, "reward_std": 0.3812199831008911, "rewards/fixed_code_pass_all_test_reward/mean": 0.7941176891326904, "rewards/fixed_code_pass_all_test_reward/std": 0.3812200427055359, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.50285925106069, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.05378217762336135, "learning_rate": 1.164761758748774e-05, "loss": 0.0022, "num_tokens": 21311456.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.503043718871057, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.060771857388317585, "learning_rate": 1.16412649343334e-05, "loss": 0.0024, "num_tokens": 21317271.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.5032281866814241, "frac_reward_zero_std": 1.0, "grad_norm": 0.2333984375, "kl": 0.13127531670033932, "learning_rate": 1.1634911600417801e-05, "loss": 0.0053, "num_tokens": 21325310.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.5034126544917912, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.027134069590829313, "learning_rate": 1.162855758837618e-05, "loss": 0.0011, "num_tokens": 21333615.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 211.625, "completions/mean_terminated_length": 211.625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.5035971223021583, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.06685583340004086, "learning_rate": 1.1622202900844033e-05, "loss": 0.0027, "num_tokens": 21341860.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.5037815901125253, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.06317011336795986, "learning_rate": 1.1615847540457156e-05, "loss": 0.0025, "num_tokens": 21348467.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 178.5, "completions/mean_terminated_length": 178.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.5039660579228925, "frac_reward_zero_std": 1.0, "grad_norm": 0.474609375, "kl": 0.09103891369886696, "learning_rate": 1.160949150985161e-05, "loss": 0.0036, "num_tokens": 21355311.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 291.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.5041505257332596, "frac_reward_zero_std": 1.0, "grad_norm": 0.25, "kl": 0.05366447055712342, "learning_rate": 1.1603134811663744e-05, "loss": 0.0021, "num_tokens": 21362526.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 193.5, "completions/mean_terminated_length": 193.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5043349935436267, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.03851398825645447, "learning_rate": 1.159677744853017e-05, "loss": 0.0015, "num_tokens": 21366930.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.5045194613539937, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.044073982862755656, "learning_rate": 1.1590419423087792e-05, "loss": 0.0018, "num_tokens": 21372820.0, "reward": 1.1399998664855957, "reward_std": 0.056568559259176254, "rewards/fixed_code_pass_all_test_reward/mean": 0.14000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.05656854063272476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 350.5, "completions/mean_terminated_length": 350.5, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.5047039291643608, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.06927605625241995, "learning_rate": 1.1584060737973786e-05, "loss": 0.0028, "num_tokens": 21380640.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 215.25, "completions/mean_terminated_length": 215.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.5048883969747279, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.07229752256534994, "learning_rate": 1.1577701395825586e-05, "loss": 0.0029, "num_tokens": 21389218.0, "reward": 1.3571429252624512, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 460.75, "completions/mean_terminated_length": 460.75, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.505072864785095, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.08270252589136362, "learning_rate": 1.157134139928092e-05, "loss": 0.0033, "num_tokens": 21403544.0, "reward": 1.9453125, "reward_std": 0.15467961132526398, "rewards/fixed_code_pass_all_test_reward/mean": 0.9453125, "rewards/fixed_code_pass_all_test_reward/std": 0.15467961132526398, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5052573325954621, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.06589836766943336, "learning_rate": 1.156498075097777e-05, "loss": 0.0026, "num_tokens": 21411793.0, "reward": 1.9583332538604736, "reward_std": 0.07715170830488205, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.07715168595314026, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5054418004058292, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.052351878490298986, "learning_rate": 1.15586194535544e-05, "loss": 0.0021, "num_tokens": 21419066.0, "reward": 1.4459459781646729, "reward_std": 0.4928794801235199, "rewards/fixed_code_pass_all_test_reward/mean": 0.5709459781646729, "rewards/fixed_code_pass_all_test_reward/std": 0.31862348318099976, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 294.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5056262682161963, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0751831317320466, "learning_rate": 1.1552257509649338e-05, "loss": 0.003, "num_tokens": 21428894.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.5058107360265633, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.08206547796726227, "learning_rate": 1.1545894921901377e-05, "loss": 0.0033, "num_tokens": 21438216.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 183.875, "completions/mean_terminated_length": 183.875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.5059952038369304, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.06702989852055907, "learning_rate": 1.1539531692949589e-05, "loss": 0.0027, "num_tokens": 21442751.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.5061796716472975, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.10885849315673113, "learning_rate": 1.1533167825433295e-05, "loss": 0.0044, "num_tokens": 21450816.0, "reward": 1.0441176891326904, "reward_std": 0.027230028063058853, "rewards/fixed_code_pass_all_test_reward/mean": 0.04411764815449715, "rewards/fixed_code_pass_all_test_reward/std": 0.027230003848671913, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5063641394576647, "frac_reward_zero_std": 1.0, "grad_norm": 0.1123046875, "kl": 0.0657978467643261, "learning_rate": 1.1526803321992097e-05, "loss": 0.0026, "num_tokens": 21460508.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 317.875, "completions/mean_terminated_length": 317.875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5065486072680317, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.07555633597075939, "learning_rate": 1.1520438185265848e-05, "loss": 0.003, "num_tokens": 21466931.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 216.0, "completions/mean_terminated_length": 216.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5067330750783988, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251953125, "kl": 0.033716772915795445, "learning_rate": 1.1514072417894672e-05, "loss": 0.0013, "num_tokens": 21472835.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5069175428887659, "frac_reward_zero_std": 1.0, "grad_norm": 0.2314453125, "kl": 0.06910681305453181, "learning_rate": 1.1507706022518952e-05, "loss": 0.0028, "num_tokens": 21476860.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 267.125, "completions/mean_terminated_length": 267.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.507102010699133, "frac_reward_zero_std": 1.0, "grad_norm": 0.349609375, "kl": 0.0733543336391449, "learning_rate": 1.1501339001779333e-05, "loss": 0.0029, "num_tokens": 21481845.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 109.875, "completions/mean_terminated_length": 109.875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.5072864785095, "frac_reward_zero_std": 0.0, "grad_norm": 4.03125, "kl": 0.1639840337447822, "learning_rate": 1.1494971358316715e-05, "loss": 0.0066, "num_tokens": 21485492.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 451.375, "completions/mean_terminated_length": 451.375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.5074709463198672, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.044893453596159816, "learning_rate": 1.1488603094772259e-05, "loss": 0.0018, "num_tokens": 21495031.0, "reward": 1.965517282485962, "reward_std": 0.0975319966673851, "rewards/fixed_code_pass_all_test_reward/mean": 0.9655172228813171, "rewards/fixed_code_pass_all_test_reward/std": 0.09753198176622391, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 122.25, "completions/mean_terminated_length": 122.25, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.5076554141302343, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.07702865591272712, "learning_rate": 1.1482234213787385e-05, "loss": 0.0031, "num_tokens": 21501713.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1201.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 736.0, "completions/mean_terminated_length": 736.0, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.5078398819406014, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.04958224925212562, "learning_rate": 1.1475864718003764e-05, "loss": 0.002, "num_tokens": 21519457.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.5080243497509684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.051867969799786806, "learning_rate": 1.1469494610063328e-05, "loss": 0.0021, "num_tokens": 21529315.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5082088175613355, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.07677498692646623, "learning_rate": 1.146312389260826e-05, "loss": 0.0031, "num_tokens": 21536341.0, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 198.875, "completions/mean_terminated_length": 198.875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5083932853717026, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.050492055248469114, "learning_rate": 1.1456752568280998e-05, "loss": 0.002, "num_tokens": 21545900.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.5085777531820698, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.08745591528713703, "learning_rate": 1.1450380639724229e-05, "loss": 0.0035, "num_tokens": 21554941.0, "reward": 1.9956395626068115, "reward_std": 0.012333251535892487, "rewards/fixed_code_pass_all_test_reward/mean": 0.9956395626068115, "rewards/fixed_code_pass_all_test_reward/std": 0.012333263643085957, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 199.375, "completions/mean_terminated_length": 199.375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5087622209924368, "frac_reward_zero_std": 1.0, "grad_norm": 0.2109375, "kl": 0.07249333849176764, "learning_rate": 1.1444008109580884e-05, "loss": 0.0029, "num_tokens": 21563360.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.5089466888028039, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "kl": 0.04247913626022637, "learning_rate": 1.143763498049416e-05, "loss": 0.0017, "num_tokens": 21574534.0, "reward": 1.899793267250061, "reward_std": 0.04201023653149605, "rewards/fixed_code_pass_all_test_reward/mean": 0.8997933864593506, "rewards/fixed_code_pass_all_test_reward/std": 0.042010217905044556, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 461.75, "completions/mean_terminated_length": 461.75, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.509131156613171, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.032761086826212704, "learning_rate": 1.1431261255107491e-05, "loss": 0.0013, "num_tokens": 21588628.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 234.0, "completions/mean_terminated_length": 234.0, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5093156244235381, "frac_reward_zero_std": 1.0, "grad_norm": 0.04638671875, "kl": 0.03936443873681128, "learning_rate": 1.1424886936064558e-05, "loss": 0.0016, "num_tokens": 21594876.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5095000922339051, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.06857722997665405, "learning_rate": 1.1418512026009294e-05, "loss": 0.0027, "num_tokens": 21598972.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 97.875, "completions/mean_terminated_length": 97.875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.5096845600442723, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.06426739948801696, "learning_rate": 1.1412136527585873e-05, "loss": 0.0026, "num_tokens": 21607211.0, "reward": 1.4724576473236084, "reward_std": 0.6200066208839417, "rewards/fixed_code_pass_all_test_reward/mean": 0.5974576473236084, "rewards/fixed_code_pass_all_test_reward/std": 0.343164324760437, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5098690278546394, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.06074325437657535, "learning_rate": 1.1405760443438713e-05, "loss": 0.0024, "num_tokens": 21615945.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 267.0, "completions/mean_terminated_length": 267.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5100534956650065, "frac_reward_zero_std": 1.0, "grad_norm": 0.486328125, "kl": 0.05336901335977018, "learning_rate": 1.1399383776212478e-05, "loss": 0.0021, "num_tokens": 21622665.0, "reward": 1.884615421295166, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8846153616905212, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 141.125, "completions/mean_terminated_length": 141.125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.5102379634753735, "frac_reward_zero_std": 1.0, "grad_norm": 0.1982421875, "kl": 0.06263949302956462, "learning_rate": 1.1393006528552071e-05, "loss": 0.0025, "num_tokens": 21631098.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 247.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5104224312857406, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.16667487774975598, "learning_rate": 1.1386628703102634e-05, "loss": 0.0067, "num_tokens": 21636430.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 446.125, "completions/mean_terminated_length": 446.125, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.5106068990961077, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.0220942753367126, "learning_rate": 1.1380250302509557e-05, "loss": 0.0009, "num_tokens": 21644375.0, "reward": 1.920454502105713, "reward_std": 0.22498849034309387, "rewards/fixed_code_pass_all_test_reward/mean": 0.9204545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.22498852014541626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5107913669064749, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.05671435105614364, "learning_rate": 1.1373871329418456e-05, "loss": 0.0023, "num_tokens": 21651032.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.5109758347168419, "frac_reward_zero_std": 1.0, "grad_norm": 0.0380859375, "kl": 0.014651053003035486, "learning_rate": 1.1367491786475195e-05, "loss": 0.0006, "num_tokens": 21656517.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 538.0, "completions/mean_terminated_length": 322.2857360839844, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.511160302527209, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.06412857351824641, "learning_rate": 1.1361111676325871e-05, "loss": 0.0026, "num_tokens": 21668669.0, "reward": 1.1785714626312256, "reward_std": 0.6331466436386108, "rewards/fixed_code_pass_all_test_reward/mean": 0.3035714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.43490222096443176, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 589.375, "completions/mean_terminated_length": 589.375, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.5113447703375761, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.034458161098882556, "learning_rate": 1.1354731001616816e-05, "loss": 0.0014, "num_tokens": 21683752.0, "reward": 1.682692289352417, "reward_std": 0.37558069825172424, "rewards/fixed_code_pass_all_test_reward/mean": 0.682692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.37558072805404663, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 254.875, "completions/mean_terminated_length": 254.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5115292381479432, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.04000532370992005, "learning_rate": 1.1348349764994594e-05, "loss": 0.0016, "num_tokens": 21688943.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5117137059583102, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.07457591639831662, "learning_rate": 1.1341967969106006e-05, "loss": 0.003, "num_tokens": 21694143.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.5118981737686774, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.051924890954978764, "learning_rate": 1.1335585616598086e-05, "loss": 0.0021, "num_tokens": 21699715.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 324.125, "completions/mean_terminated_length": 324.125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.5120826415790445, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.06928668729960918, "learning_rate": 1.1329202710118088e-05, "loss": 0.0028, "num_tokens": 21707588.0, "reward": 1.4285714626312256, "reward_std": 0.47687023878097534, "rewards/fixed_code_pass_all_test_reward/mean": 0.4285714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.47687023878097534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.5122671093894116, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.04577117180451751, "learning_rate": 1.132281925231351e-05, "loss": 0.0018, "num_tokens": 21712875.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 385.125, "completions/mean_terminated_length": 385.125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5124515771997786, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.0404645586386323, "learning_rate": 1.1316435245832071e-05, "loss": 0.0016, "num_tokens": 21720012.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 244.625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.5126360450101457, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.06661752332001925, "learning_rate": 1.1310050693321721e-05, "loss": 0.0027, "num_tokens": 21726225.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5128205128205128, "frac_reward_zero_std": 0.0, "grad_norm": 2.953125, "kl": 0.0753330159932375, "learning_rate": 1.1303665597430631e-05, "loss": 0.003, "num_tokens": 21735755.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 141.625, "completions/mean_terminated_length": 141.625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.51300498063088, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.0655957693234086, "learning_rate": 1.12972799608072e-05, "loss": 0.0026, "num_tokens": 21745024.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.513189448441247, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.05799030605703592, "learning_rate": 1.1290893786100058e-05, "loss": 0.0023, "num_tokens": 21753396.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 250.75, "completions/mean_terminated_length": 250.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5133739162516141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0498046875, "kl": 0.026142674265429378, "learning_rate": 1.1284507075958048e-05, "loss": 0.001, "num_tokens": 21760002.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5135583840619812, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.10257264226675034, "learning_rate": 1.127811983303024e-05, "loss": 0.0041, "num_tokens": 21770775.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5137428518723482, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.06867937976494431, "learning_rate": 1.1271732059965925e-05, "loss": 0.0027, "num_tokens": 21778079.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.5139273196827153, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.06944828387349844, "learning_rate": 1.1265343759414614e-05, "loss": 0.0028, "num_tokens": 21784694.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 252.375, "completions/mean_terminated_length": 252.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.5141117874930825, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.07956474320963025, "learning_rate": 1.1258954934026032e-05, "loss": 0.0032, "num_tokens": 21789593.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.5142962553034496, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.10000539571046829, "learning_rate": 1.1252565586450133e-05, "loss": 0.004, "num_tokens": 21795190.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 120.25, "completions/mean_terminated_length": 120.25, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5144807231138167, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.05044715851545334, "learning_rate": 1.1246175719337073e-05, "loss": 0.002, "num_tokens": 21801504.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 110.5, "completions/mean_terminated_length": 110.5, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.5146651909241837, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.06457239296287298, "learning_rate": 1.1239785335337236e-05, "loss": 0.0026, "num_tokens": 21809492.0, "reward": 1.6302082538604736, "reward_std": 0.5105229616165161, "rewards/fixed_code_pass_all_test_reward/mean": 0.6302083134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.5105229616165161, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 293.25, "completions/mean_terminated_length": 293.25, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5148496587345508, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.04715607548132539, "learning_rate": 1.1233394437101212e-05, "loss": 0.0019, "num_tokens": 21816758.0, "reward": 1.9249999523162842, "reward_std": 0.2121320217847824, "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5150341265449179, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.05243952805176377, "learning_rate": 1.1227003027279812e-05, "loss": 0.0021, "num_tokens": 21823074.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 290.875, "completions/mean_terminated_length": 290.875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.515218594355285, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.029655779304448515, "learning_rate": 1.1220611108524054e-05, "loss": 0.0012, "num_tokens": 21832633.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 243.375, "completions/mean_terminated_length": 243.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.5154030621656521, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.0679078467655927, "learning_rate": 1.1214218683485159e-05, "loss": 0.0027, "num_tokens": 21841388.0, "reward": 1.7840909957885742, "reward_std": 0.2421196550130844, "rewards/fixed_code_pass_all_test_reward/mean": 0.7840908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.2421196699142456, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 441.875, "completions/mean_terminated_length": 441.875, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.5155875299760192, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.030040468671359122, "learning_rate": 1.120782575481458e-05, "loss": 0.0012, "num_tokens": 21849171.0, "reward": 1.859375, "reward_std": 0.3499840497970581, "rewards/fixed_code_pass_all_test_reward/mean": 0.984375, "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 185.5, "completions/mean_terminated_length": 185.5, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.5157719977863863, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.06825133599340916, "learning_rate": 1.1201432325163955e-05, "loss": 0.0027, "num_tokens": 21853551.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 930.875, "completions/mean_terminated_length": 930.875, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.5159564655967533, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.03877966455183923, "learning_rate": 1.1195038397185148e-05, "loss": 0.0016, "num_tokens": 21873390.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 262.375, "completions/mean_terminated_length": 262.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.5161409334071204, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.059132230235263705, "learning_rate": 1.1188643973530217e-05, "loss": 0.0024, "num_tokens": 21880185.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.5163254012174876, "frac_reward_zero_std": 1.0, "grad_norm": 0.059814453125, "kl": 0.0312650689156726, "learning_rate": 1.1182249056851435e-05, "loss": 0.0013, "num_tokens": 21888039.0, "reward": 1.0625, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 346.0, "completions/mean_terminated_length": 346.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5165098690278547, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.06655944208614528, "learning_rate": 1.1175853649801274e-05, "loss": 0.0027, "num_tokens": 21900487.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.5166943368382217, "frac_reward_zero_std": 1.0, "grad_norm": 0.75390625, "kl": 0.15797563968226314, "learning_rate": 1.1169457755032409e-05, "loss": 0.0063, "num_tokens": 21907953.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5168788046485888, "frac_reward_zero_std": 1.0, "grad_norm": 1.0703125, "kl": 0.11775696324184537, "learning_rate": 1.1163061375197721e-05, "loss": 0.0047, "num_tokens": 21916529.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 233.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.5170632724589559, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.06335880188271403, "learning_rate": 1.1156664512950287e-05, "loss": 0.0025, "num_tokens": 21927327.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.517247740269323, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.038034659810364246, "learning_rate": 1.1150267170943394e-05, "loss": 0.0015, "num_tokens": 21932791.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 473.375, "completions/mean_terminated_length": 473.375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.5174322080796901, "frac_reward_zero_std": 1.0, "grad_norm": 0.057373046875, "kl": 0.03199424722697586, "learning_rate": 1.1143869351830514e-05, "loss": 0.0013, "num_tokens": 21942098.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.5176166758900572, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.05074069695547223, "learning_rate": 1.113747105826533e-05, "loss": 0.002, "num_tokens": 21947904.0, "reward": 1.8068182468414307, "reward_std": 0.08590098470449448, "rewards/fixed_code_pass_all_test_reward/mean": 0.8068181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.08590102195739746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5178011437004243, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.09624974802136421, "learning_rate": 1.1131072292901712e-05, "loss": 0.0038, "num_tokens": 21956449.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 174.5, "completions/mean_terminated_length": 174.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5179856115107914, "frac_reward_zero_std": 1.0, "grad_norm": 0.2216796875, "kl": 0.061480795964598656, "learning_rate": 1.1124673058393731e-05, "loss": 0.0025, "num_tokens": 21964813.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.5181700793211584, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0848364932462573, "learning_rate": 1.1118273357395653e-05, "loss": 0.0034, "num_tokens": 21973781.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 335.875, "completions/mean_terminated_length": 335.875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5183545471315255, "frac_reward_zero_std": 1.0, "grad_norm": 0.04833984375, "kl": 0.02312794717727229, "learning_rate": 1.1111873192561933e-05, "loss": 0.0009, "num_tokens": 21981580.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 244.875, "completions/mean_terminated_length": 244.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.5185390149418926, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.08807057421654463, "learning_rate": 1.1105472566547222e-05, "loss": 0.0035, "num_tokens": 21986451.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 244.875, "completions/mean_terminated_length": 244.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5187234827522598, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.061100332997739315, "learning_rate": 1.1099071482006361e-05, "loss": 0.0024, "num_tokens": 21996882.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5189079505626268, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.09929554536938667, "learning_rate": 1.1092669941594386e-05, "loss": 0.004, "num_tokens": 22018910.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 192.375, "completions/mean_terminated_length": 192.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5190924183729939, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.04830296500585973, "learning_rate": 1.1086267947966509e-05, "loss": 0.0019, "num_tokens": 22023233.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.519276886183361, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.0883169169537723, "learning_rate": 1.1079865503778144e-05, "loss": 0.0035, "num_tokens": 22028720.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 214.75, "completions/mean_terminated_length": 214.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.5194613539937281, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.05145725037436932, "learning_rate": 1.1073462611684891e-05, "loss": 0.0021, "num_tokens": 22034558.0, "reward": 1.5916666984558105, "reward_std": 0.1649916023015976, "rewards/fixed_code_pass_all_test_reward/mean": 0.5916666984558105, "rewards/fixed_code_pass_all_test_reward/std": 0.1649915724992752, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5196458218040951, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.08933762554079294, "learning_rate": 1.1067059274342524e-05, "loss": 0.0036, "num_tokens": 22038816.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5198302896144623, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.04500139271840453, "learning_rate": 1.1060655494407013e-05, "loss": 0.0018, "num_tokens": 22047054.0, "reward": 1.5263158082962036, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5263158082962036, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 315.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.5200147574248294, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.07039323612116277, "learning_rate": 1.1054251274534506e-05, "loss": 0.0028, "num_tokens": 22052890.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.5201992252351965, "frac_reward_zero_std": 1.0, "grad_norm": 0.1806640625, "kl": 0.07169554987922311, "learning_rate": 1.1047846617381336e-05, "loss": 0.0029, "num_tokens": 22060283.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5203836930455635, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.05748449871316552, "learning_rate": 1.1041441525604015e-05, "loss": 0.0023, "num_tokens": 22068881.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 236.75, "completions/mean_terminated_length": 236.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5205681608559306, "frac_reward_zero_std": 1.0, "grad_norm": 1.1796875, "kl": 0.08551336987875402, "learning_rate": 1.1035036001859241e-05, "loss": 0.0034, "num_tokens": 22078783.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.5207526286662977, "frac_reward_zero_std": 1.0, "grad_norm": 0.271484375, "kl": 0.07506263861432672, "learning_rate": 1.102863004880388e-05, "loss": 0.003, "num_tokens": 22085608.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.5209370964766649, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.07959208462852985, "learning_rate": 1.1022223669094992e-05, "loss": 0.0032, "num_tokens": 22091914.0, "reward": 1.4107142686843872, "reward_std": 0.48857197165489197, "rewards/fixed_code_pass_all_test_reward/mean": 0.4107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.48857203125953674, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 194.25, "completions/mean_terminated_length": 194.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5211215642870319, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.06889720563776791, "learning_rate": 1.1015816865389797e-05, "loss": 0.0028, "num_tokens": 22099116.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 303.0, "completions/mean_terminated_length": 303.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.521306032097399, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.052699246909469366, "learning_rate": 1.1009409640345701e-05, "loss": 0.0021, "num_tokens": 22108068.0, "reward": 1.121212124824524, "reward_std": 0.5500050187110901, "rewards/fixed_code_pass_all_test_reward/mean": 0.24621212482452393, "rewards/fixed_code_pass_all_test_reward/std": 0.3273518979549408, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 515.375, "completions/mean_terminated_length": 515.375, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.5214904999077661, "frac_reward_zero_std": 1.0, "grad_norm": 0.05859375, "kl": 0.027761294040828943, "learning_rate": 1.1003001996620284e-05, "loss": 0.0011, "num_tokens": 22118111.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.5216749677181332, "frac_reward_zero_std": 1.0, "grad_norm": 3.40625, "kl": 0.2936812271364033, "learning_rate": 1.0996593936871296e-05, "loss": 0.0117, "num_tokens": 22123194.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 262.125, "completions/mean_terminated_length": 262.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5218594355285002, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.04736586567014456, "learning_rate": 1.0990185463756664e-05, "loss": 0.0019, "num_tokens": 22128259.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 179.375, "completions/mean_terminated_length": 179.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5220439033388674, "frac_reward_zero_std": 1.0, "grad_norm": 0.057373046875, "kl": 0.036755701526999474, "learning_rate": 1.0983776579934483e-05, "loss": 0.0015, "num_tokens": 22135750.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 336.5, "completions/mean_terminated_length": 336.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5222283711492345, "frac_reward_zero_std": 1.0, "grad_norm": 0.05078125, "kl": 0.026326050399802625, "learning_rate": 1.0977367288063021e-05, "loss": 0.0011, "num_tokens": 22146362.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 362.375, "completions/mean_terminated_length": 362.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5224128389596016, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.05091599468141794, "learning_rate": 1.0970957590800713e-05, "loss": 0.002, "num_tokens": 22155061.0, "reward": 1.9711538553237915, "reward_std": 0.05341268330812454, "rewards/fixed_code_pass_all_test_reward/mean": 0.9711538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.05341270938515663, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 257.625, "completions/mean_terminated_length": 257.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.5225973067699686, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.03149886429309845, "learning_rate": 1.0964547490806164e-05, "loss": 0.0013, "num_tokens": 22161554.0, "reward": 1.067307710647583, "reward_std": 0.027196446433663368, "rewards/fixed_code_pass_all_test_reward/mean": 0.06730769574642181, "rewards/fixed_code_pass_all_test_reward/std": 0.02719641663134098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.5227817745803357, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.044260552851483226, "learning_rate": 1.0958136990738143e-05, "loss": 0.0018, "num_tokens": 22167443.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.5229662423907028, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.09342910349369049, "learning_rate": 1.0951726093255586e-05, "loss": 0.0037, "num_tokens": 22171348.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 196.75, "completions/mean_terminated_length": 196.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.52315071020107, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.057230569422245026, "learning_rate": 1.0945314801017601e-05, "loss": 0.0023, "num_tokens": 22176106.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.523335178011437, "frac_reward_zero_std": 1.0, "grad_norm": 0.158203125, "kl": 0.06981717702001333, "learning_rate": 1.0938903116683447e-05, "loss": 0.0028, "num_tokens": 22180150.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.5235196458218041, "frac_reward_zero_std": 1.0, "grad_norm": 10.5625, "kl": 0.3724119374528527, "learning_rate": 1.0932491042912557e-05, "loss": 0.0149, "num_tokens": 22186195.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 235.875, "completions/mean_terminated_length": 235.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5237041136321712, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.07150500360876322, "learning_rate": 1.0926078582364515e-05, "loss": 0.0029, "num_tokens": 22193818.0, "reward": 1.2352941036224365, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.23529411852359772, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 238.875, "completions/mean_terminated_length": 238.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.5238885814425382, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.02874091942794621, "learning_rate": 1.0919665737699077e-05, "loss": 0.0011, "num_tokens": 22200057.0, "reward": 1.5568182468414307, "reward_std": 0.22498852014541626, "rewards/fixed_code_pass_all_test_reward/mean": 0.5568181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.22498852014541626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.5240730492529053, "frac_reward_zero_std": 1.0, "grad_norm": 0.1923828125, "kl": 0.09019806701689959, "learning_rate": 1.0913252511576151e-05, "loss": 0.0036, "num_tokens": 22210560.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5242575170632725, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.04106673173373565, "learning_rate": 1.09068389066558e-05, "loss": 0.0016, "num_tokens": 22219880.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 153.25, "completions/mean_terminated_length": 153.25, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.5244419848736396, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.10602602735161781, "learning_rate": 1.0900424925598259e-05, "loss": 0.0042, "num_tokens": 22228034.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.5246264526840066, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.05191810172982514, "learning_rate": 1.0894010571063896e-05, "loss": 0.0021, "num_tokens": 22232525.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.5248109204943737, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.07341895636636764, "learning_rate": 1.0887595845713257e-05, "loss": 0.0029, "num_tokens": 22236595.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 137.375, "completions/mean_terminated_length": 137.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.5249953883047408, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.051474168663844466, "learning_rate": 1.0881180752207026e-05, "loss": 0.0021, "num_tokens": 22240446.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.5251798561151079, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.04622363508678973, "learning_rate": 1.0874765293206049e-05, "loss": 0.0018, "num_tokens": 22245101.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.525364323925475, "frac_reward_zero_std": 1.0, "grad_norm": 1.5859375, "kl": 0.0912965964525938, "learning_rate": 1.0868349471371316e-05, "loss": 0.0037, "num_tokens": 22251420.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 116.375, "completions/mean_terminated_length": 116.375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.5255487917358421, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.0451276577077806, "learning_rate": 1.0861933289363975e-05, "loss": 0.0018, "num_tokens": 22255135.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 290.25, "completions/mean_terminated_length": 290.25, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5257332595462092, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.04979930818080902, "learning_rate": 1.0855516749845317e-05, "loss": 0.002, "num_tokens": 22261873.0, "reward": 1.8321428298950195, "reward_std": 0.1299809217453003, "rewards/fixed_code_pass_all_test_reward/mean": 0.8321428298950195, "rewards/fixed_code_pass_all_test_reward/std": 0.1299809217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 360.625, "completions/mean_terminated_length": 360.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.5259177273565763, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.0562515176134184, "learning_rate": 1.0849099855476786e-05, "loss": 0.0023, "num_tokens": 22272878.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.5261021951669433, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.06942415260709822, "learning_rate": 1.084268260891997e-05, "loss": 0.0028, "num_tokens": 22282381.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 222.125, "completions/mean_terminated_length": 222.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5262866629773104, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.07050213241018355, "learning_rate": 1.0836265012836602e-05, "loss": 0.0028, "num_tokens": 22293206.0, "reward": 1.5192307233810425, "reward_std": 0.39811471104621887, "rewards/fixed_code_pass_all_test_reward/mean": 0.5192307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.39811477065086365, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 164.125, "completions/mean_terminated_length": 164.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5264711307876776, "frac_reward_zero_std": 1.0, "grad_norm": 0.2158203125, "kl": 0.07191826775670052, "learning_rate": 1.0829847069888566e-05, "loss": 0.0029, "num_tokens": 22303063.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.5266555985980447, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.061794571578502655, "learning_rate": 1.0823428782737885e-05, "loss": 0.0025, "num_tokens": 22309388.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5268400664084117, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.0636354349553585, "learning_rate": 1.0817010154046728e-05, "loss": 0.0025, "num_tokens": 22320815.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 290.25, "completions/mean_terminated_length": 290.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5270245342187788, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.06486308155581355, "learning_rate": 1.0810591186477402e-05, "loss": 0.0026, "num_tokens": 22329593.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.5272090020291459, "frac_reward_zero_std": 1.0, "grad_norm": 0.2412109375, "kl": 0.10514181852340698, "learning_rate": 1.0804171882692351e-05, "loss": 0.0042, "num_tokens": 22333577.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.527393469839513, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.05069387727417052, "learning_rate": 1.0797752245354175e-05, "loss": 0.002, "num_tokens": 22339904.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 275.375, "completions/mean_terminated_length": 275.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5275779376498801, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.08242975128814578, "learning_rate": 1.0791332277125587e-05, "loss": 0.0033, "num_tokens": 22348931.0, "reward": 1.9193549156188965, "reward_std": 0.12787501513957977, "rewards/fixed_code_pass_all_test_reward/mean": 0.9193548560142517, "rewards/fixed_code_pass_all_test_reward/std": 0.12787500023841858, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 506.25, "completions/mean_terminated_length": 506.25, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.5277624054602472, "frac_reward_zero_std": 1.0, "grad_norm": 0.054443359375, "kl": 0.027272063540294766, "learning_rate": 1.0784911980669463e-05, "loss": 0.0011, "num_tokens": 22356701.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5279468732706143, "frac_reward_zero_std": 1.0, "grad_norm": 0.91796875, "kl": 0.10435384907759726, "learning_rate": 1.0778491358648797e-05, "loss": 0.0042, "num_tokens": 22368224.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 201.75, "completions/mean_terminated_length": 201.75, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.5281313410809814, "frac_reward_zero_std": 0.0, "grad_norm": 2.953125, "kl": 0.05660028988495469, "learning_rate": 1.0772070413726727e-05, "loss": 0.0023, "num_tokens": 22377590.0, "reward": 1.1488970518112183, "reward_std": 0.35034385323524475, "rewards/fixed_code_pass_all_test_reward/mean": 0.14889705181121826, "rewards/fixed_code_pass_all_test_reward/std": 0.35034388303756714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 306.25, "completions/mean_terminated_length": 306.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.5283158088913484, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.05581633443944156, "learning_rate": 1.076564914856652e-05, "loss": 0.0022, "num_tokens": 22389072.0, "reward": 1.2286585569381714, "reward_std": 0.11872920393943787, "rewards/fixed_code_pass_all_test_reward/mean": 0.228658527135849, "rewards/fixed_code_pass_all_test_reward/std": 0.11872921139001846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 330.375, "completions/mean_terminated_length": 330.375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.5285002767017155, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.03420646209269762, "learning_rate": 1.0759227565831582e-05, "loss": 0.0014, "num_tokens": 22399131.0, "reward": 1.9895833730697632, "reward_std": 0.008035284467041492, "rewards/fixed_code_pass_all_test_reward/mean": 0.9895833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.008035296574234962, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 306.875, "completions/mean_terminated_length": 306.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5286847445120827, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.056652861181646585, "learning_rate": 1.0752805668185442e-05, "loss": 0.0023, "num_tokens": 22408570.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 350.375, "completions/mean_terminated_length": 350.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.5288692123224498, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.034640196361579, "learning_rate": 1.0746383458291766e-05, "loss": 0.0014, "num_tokens": 22416077.0, "reward": 1.4738372564315796, "reward_std": 0.14546504616737366, "rewards/fixed_code_pass_all_test_reward/mean": 0.4738372266292572, "rewards/fixed_code_pass_all_test_reward/std": 0.14546507596969604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 254.625, "completions/mean_terminated_length": 254.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5290536801328168, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.06344530684873462, "learning_rate": 1.0739960938814354e-05, "loss": 0.0025, "num_tokens": 22425818.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 344.0, "completions/mean_terminated_length": 344.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.5292381479431839, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.03709263319615275, "learning_rate": 1.0733538112417117e-05, "loss": 0.0015, "num_tokens": 22432394.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.529422615753551, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.033388080075383186, "learning_rate": 1.0727114981764115e-05, "loss": 0.0013, "num_tokens": 22437337.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5296070835639181, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.04971428005956113, "learning_rate": 1.0720691549519517e-05, "loss": 0.002, "num_tokens": 22445615.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.5297915513742852, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.08263223338872194, "learning_rate": 1.071426781834763e-05, "loss": 0.0033, "num_tokens": 22450011.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.5299760191846523, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.1221967451274395, "learning_rate": 1.0707843790912871e-05, "loss": 0.0049, "num_tokens": 22456433.0, "reward": 1.6764706373214722, "reward_std": 0.2847239375114441, "rewards/fixed_code_pass_all_test_reward/mean": 0.6764705777168274, "rewards/fixed_code_pass_all_test_reward/std": 0.2847239375114441, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.5301604869950194, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.05089395050890744, "learning_rate": 1.0701419469879795e-05, "loss": 0.002, "num_tokens": 22461104.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 227.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5303449548053865, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.04488983517512679, "learning_rate": 1.069499485791307e-05, "loss": 0.0018, "num_tokens": 22465772.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 629.5, "completions/mean_terminated_length": 629.5, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.5305294226157535, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.028606810374185443, "learning_rate": 1.0688569957677483e-05, "loss": 0.0011, "num_tokens": 22475712.0, "reward": 1.4642857313156128, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.7142857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5307138904261206, "frac_reward_zero_std": 0.0, "grad_norm": 3.359375, "kl": 0.050307599594816566, "learning_rate": 1.068214477183795e-05, "loss": 0.002, "num_tokens": 22483909.0, "reward": 1.545454502105713, "reward_std": 0.2915576994419098, "rewards/fixed_code_pass_all_test_reward/mean": 0.5454545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.2915577292442322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 464.25, "completions/mean_terminated_length": 464.25, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.5308983582364877, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.04617670155130327, "learning_rate": 1.0675719303059493e-05, "loss": 0.0018, "num_tokens": 22495023.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.5310828260468549, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.04782007681205869, "learning_rate": 1.0669293554007263e-05, "loss": 0.0019, "num_tokens": 22504583.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.5312672938572219, "frac_reward_zero_std": 1.0, "grad_norm": 0.52734375, "kl": 0.08962139952927828, "learning_rate": 1.0662867527346518e-05, "loss": 0.0036, "num_tokens": 22509175.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.531451761667589, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.03740406036376953, "learning_rate": 1.0656441225742638e-05, "loss": 0.0015, "num_tokens": 22514626.0, "reward": 1.274999976158142, "reward_std": 0.45276927947998047, "rewards/fixed_code_pass_all_test_reward/mean": 0.2750000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.45276927947998047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 260.25, "completions/mean_terminated_length": 260.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5316362294779561, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.05778325581923127, "learning_rate": 1.065001465186111e-05, "loss": 0.0023, "num_tokens": 22519772.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 524.25, "completions/mean_terminated_length": 524.25, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.5318206972883232, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.022832722985185683, "learning_rate": 1.0643587808367544e-05, "loss": 0.0009, "num_tokens": 22530038.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 378.875, "completions/mean_terminated_length": 378.875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.5320051650986902, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.14635428017936647, "learning_rate": 1.0637160697927651e-05, "loss": 0.0059, "num_tokens": 22538349.0, "reward": 1.7443182468414307, "reward_std": 0.4596707820892334, "rewards/fixed_code_pass_all_test_reward/mean": 0.8693181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.3516175448894501, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 131.875, "completions/mean_terminated_length": 131.875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.5321896329090574, "frac_reward_zero_std": 0.0, "grad_norm": 3.40625, "kl": 0.13630696991458535, "learning_rate": 1.063073332320726e-05, "loss": 0.0055, "num_tokens": 22546692.0, "reward": 1.6845238208770752, "reward_std": 0.45952996611595154, "rewards/fixed_code_pass_all_test_reward/mean": 0.8095238208770752, "rewards/fixed_code_pass_all_test_reward/std": 0.3749527633190155, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5323741007194245, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.05665122461505234, "learning_rate": 1.0624305686872305e-05, "loss": 0.0023, "num_tokens": 22553391.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5325585685297916, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.16702727181836963, "learning_rate": 1.0617877791588831e-05, "loss": 0.0067, "num_tokens": 22560242.0, "reward": 1.829545497894287, "reward_std": 0.35852745175361633, "rewards/fixed_code_pass_all_test_reward/mean": 0.8295454382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.35852742195129395, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 238.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5327430363401586, "frac_reward_zero_std": 1.0, "grad_norm": 0.059814453125, "kl": 0.03078385430853814, "learning_rate": 1.0611449640022995e-05, "loss": 0.0012, "num_tokens": 22566516.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.5329275041505257, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.07778582954779267, "learning_rate": 1.0605021234841049e-05, "loss": 0.0031, "num_tokens": 22576041.0, "reward": 1.875, "reward_std": 0.1725163608789444, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.17251639068126678, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 376.875, "completions/mean_terminated_length": 376.875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5331119719608928, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.042972037685103714, "learning_rate": 1.0598592578709361e-05, "loss": 0.0017, "num_tokens": 22586304.0, "reward": 1.9924242496490479, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9924242496490479, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 113.5, "completions/mean_terminated_length": 113.5, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.53329643977126, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.09258627984672785, "learning_rate": 1.0592163674294395e-05, "loss": 0.0037, "num_tokens": 22590716.0, "reward": 1.0, "reward_std": 0.4522882401943207, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2094048261642456, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.533480907581627, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.023362795065622777, "learning_rate": 1.0585734524262727e-05, "loss": 0.0009, "num_tokens": 22598001.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 198.0, "completions/mean_terminated_length": 198.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5336653753919941, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.056577556766569614, "learning_rate": 1.0579305131281026e-05, "loss": 0.0023, "num_tokens": 22602657.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.5338498432023612, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.0577670366037637, "learning_rate": 1.0572875498016062e-05, "loss": 0.0023, "num_tokens": 22608079.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5340343110127282, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.07118513737805188, "learning_rate": 1.0566445627134718e-05, "loss": 0.0028, "num_tokens": 22617183.0, "reward": 1.4375, "reward_std": 0.3471825420856476, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 175.375, "completions/mean_terminated_length": 175.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5342187788230953, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.05093751079402864, "learning_rate": 1.0560015521303954e-05, "loss": 0.002, "num_tokens": 22621450.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 400.875, "completions/mean_terminated_length": 400.875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5344032466334625, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.03199493215652183, "learning_rate": 1.055358518319085e-05, "loss": 0.0013, "num_tokens": 22631329.0, "reward": 1.8624999523162842, "reward_std": 0.34923386573791504, "rewards/fixed_code_pass_all_test_reward/mean": 0.862500011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.34923386573791504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 252.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5345877144438296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12109375, "kl": 0.0781529585365206, "learning_rate": 1.0547154615462563e-05, "loss": 0.0031, "num_tokens": 22640193.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 437.25, "completions/mean_terminated_length": 437.25, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.5347721822541966, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.03765420813579112, "learning_rate": 1.0540723820786358e-05, "loss": 0.0015, "num_tokens": 22647427.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 254.5, "completions/mean_terminated_length": 254.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.5349566500645637, "frac_reward_zero_std": 1.0, "grad_norm": 0.412109375, "kl": 0.0720269801095128, "learning_rate": 1.0534292801829589e-05, "loss": 0.0029, "num_tokens": 22656175.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5351411178749308, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.061650448478758335, "learning_rate": 1.05278615612597e-05, "loss": 0.0025, "num_tokens": 22660437.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 502.25, "completions/mean_terminated_length": 502.25, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.5353255856852979, "frac_reward_zero_std": 0.0, "grad_norm": 0.76953125, "kl": 0.029154462041333318, "learning_rate": 1.0521430101744238e-05, "loss": 0.0012, "num_tokens": 22673599.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 344.375, "completions/mean_terminated_length": 344.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.535510053495665, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.037695730570703745, "learning_rate": 1.0514998425950826e-05, "loss": 0.0015, "num_tokens": 22683186.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.5356945213060321, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.04198534006718546, "learning_rate": 1.0508566536547191e-05, "loss": 0.0017, "num_tokens": 22688841.0, "reward": 1.241666555404663, "reward_std": 0.3064129054546356, "rewards/fixed_code_pass_all_test_reward/mean": 0.24166667461395264, "rewards/fixed_code_pass_all_test_reward/std": 0.306412935256958, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 398.625, "completions/mean_terminated_length": 398.625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.5358789891163992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0537109375, "kl": 0.028222952503710985, "learning_rate": 1.0502134436201135e-05, "loss": 0.0011, "num_tokens": 22696454.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5360634569267663, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.09307905845344067, "learning_rate": 1.049570212758056e-05, "loss": 0.0037, "num_tokens": 22705385.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 293.875, "completions/mean_terminated_length": 293.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5362479247371333, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.11214547790586948, "learning_rate": 1.0489269613353445e-05, "loss": 0.0045, "num_tokens": 22714912.0, "reward": 1.7379032373428345, "reward_std": 0.3910844624042511, "rewards/fixed_code_pass_all_test_reward/mean": 0.8629032373428345, "rewards/fixed_code_pass_all_test_reward/std": 0.25907063484191895, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.5364323925475004, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.03336155042052269, "learning_rate": 1.0482836896187862e-05, "loss": 0.0013, "num_tokens": 22723016.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.5366168603578676, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.1418874787632376, "learning_rate": 1.0476403978751961e-05, "loss": 0.0057, "num_tokens": 22730794.0, "reward": 1.4515306949615479, "reward_std": 0.4724123179912567, "rewards/fixed_code_pass_all_test_reward/mean": 0.4515306055545807, "rewards/fixed_code_pass_all_test_reward/std": 0.4724123179912567, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 625.75, "completions/mean_terminated_length": 625.75, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.5368013281682347, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.03234223369508982, "learning_rate": 1.0469970863713976e-05, "loss": 0.0013, "num_tokens": 22747016.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 532.25, "completions/mean_terminated_length": 532.25, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.5369857959786017, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.029410035582259297, "learning_rate": 1.0463537553742225e-05, "loss": 0.0012, "num_tokens": 22760402.0, "reward": 1.7361111640930176, "reward_std": 0.16732670366764069, "rewards/fixed_code_pass_all_test_reward/mean": 0.7361111640930176, "rewards/fixed_code_pass_all_test_reward/std": 0.16732673346996307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 415.25, "completions/mean_terminated_length": 415.25, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.5371702637889688, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.11432363820495084, "learning_rate": 1.0457104051505105e-05, "loss": 0.0046, "num_tokens": 22772364.0, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 319.75, "completions/mean_terminated_length": 319.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.5373547315993359, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.05747246742248535, "learning_rate": 1.0450670359671099e-05, "loss": 0.0023, "num_tokens": 22778034.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 278.875, "completions/mean_terminated_length": 278.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.537539199409703, "frac_reward_zero_std": 1.0, "grad_norm": 0.050048828125, "kl": 0.020137262414209545, "learning_rate": 1.0444236480908752e-05, "loss": 0.0008, "num_tokens": 22783689.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 475.25, "completions/mean_terminated_length": 475.25, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.5377236672200701, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.040157190640456975, "learning_rate": 1.043780241788671e-05, "loss": 0.0016, "num_tokens": 22792923.0, "reward": 1.1944444179534912, "reward_std": 0.07856737822294235, "rewards/fixed_code_pass_all_test_reward/mean": 0.1944444477558136, "rewards/fixed_code_pass_all_test_reward/std": 0.07856741547584534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 451.25, "completions/mean_terminated_length": 451.25, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.5379081350304372, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.06845527281984687, "learning_rate": 1.0431368173273682e-05, "loss": 0.0027, "num_tokens": 22802997.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5380926028408043, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.06678146845661104, "learning_rate": 1.0424933749738446e-05, "loss": 0.0027, "num_tokens": 22812274.0, "reward": 1.942307710647583, "reward_std": 0.11446517705917358, "rewards/fixed_code_pass_all_test_reward/mean": 0.942307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.11446519196033478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 302.5, "completions/mean_terminated_length": 302.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5382770706511714, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.08439515298232436, "learning_rate": 1.041849914994987e-05, "loss": 0.0034, "num_tokens": 22819286.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.5384615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.0698239584453404, "learning_rate": 1.041206437657688e-05, "loss": 0.0028, "num_tokens": 22826834.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 241.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.5386460062719055, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.14297528378665447, "learning_rate": 1.040562943228849e-05, "loss": 0.0057, "num_tokens": 22833440.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 309.375, "completions/mean_terminated_length": 309.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.5388304740822727, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.07073347736150026, "learning_rate": 1.0399194319753764e-05, "loss": 0.0028, "num_tokens": 22841915.0, "reward": 1.7083333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.11785111576318741, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 298.875, "completions/mean_terminated_length": 298.875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.5390149418926398, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.03134589415276423, "learning_rate": 1.0392759041641859e-05, "loss": 0.0013, "num_tokens": 22850506.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.5391994097030068, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.048020597314462066, "learning_rate": 1.0386323600621981e-05, "loss": 0.0019, "num_tokens": 22857161.0, "reward": 1.9239130020141602, "reward_std": 0.1408856362104416, "rewards/fixed_code_pass_all_test_reward/mean": 0.9239130616188049, "rewards/fixed_code_pass_all_test_reward/std": 0.14088566601276398, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 322.5, "completions/mean_terminated_length": 322.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.5393838775133739, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.06474730675108731, "learning_rate": 1.037988799936342e-05, "loss": 0.0026, "num_tokens": 22867301.0, "reward": 1.3333332538604736, "reward_std": 0.08904069662094116, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.08904072642326355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 181.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.539568345323741, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.03292603336740285, "learning_rate": 1.0373452240535519e-05, "loss": 0.0013, "num_tokens": 22871615.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.5397528131341081, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.06376901501789689, "learning_rate": 1.0367016326807691e-05, "loss": 0.0026, "num_tokens": 22879997.0, "reward": 1.8421052694320679, "reward_std": 0.3375931680202484, "rewards/fixed_code_pass_all_test_reward/mean": 0.8421052694320679, "rewards/fixed_code_pass_all_test_reward/std": 0.337593138217926, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 278.375, "completions/mean_terminated_length": 278.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.5399372809444752, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.05139678227715194, "learning_rate": 1.0360580260849418e-05, "loss": 0.0021, "num_tokens": 22889992.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5401217487548423, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.08663269691169262, "learning_rate": 1.0354144045330237e-05, "loss": 0.0035, "num_tokens": 22894914.0, "reward": 0.890625, "reward_std": 0.362515389919281, "rewards/fixed_code_pass_all_test_reward/mean": 0.015625, "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 246.125, "completions/mean_terminated_length": 246.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.5403062165652094, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.07935085985809565, "learning_rate": 1.0347707682919755e-05, "loss": 0.0032, "num_tokens": 22900883.0, "reward": 1.453125, "reward_std": 0.22097086906433105, "rewards/fixed_code_pass_all_test_reward/mean": 0.453125, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 335.125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.5404906843755765, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.03251773677766323, "learning_rate": 1.0341271176287632e-05, "loss": 0.0013, "num_tokens": 22908148.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 412.625, "completions/mean_terminated_length": 412.625, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.5406751521859435, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.04560885298997164, "learning_rate": 1.0334834528103597e-05, "loss": 0.0018, "num_tokens": 22917025.0, "reward": 1.59375, "reward_std": 0.6538007259368896, "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, "rewards/fixed_code_pass_all_test_reward/std": 0.31160587072372437, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 314.125, "completions/mean_terminated_length": 314.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5408596199963106, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.08063866943120956, "learning_rate": 1.0328397741037427e-05, "loss": 0.0032, "num_tokens": 22926666.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 488.0, "completions/mean_terminated_length": 488.0, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.5410440878066778, "frac_reward_zero_std": 1.0, "grad_norm": 0.037353515625, "kl": 0.034305014880374074, "learning_rate": 1.0321960817758971e-05, "loss": 0.0014, "num_tokens": 22937746.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5412285556170449, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.09129395592026412, "learning_rate": 1.0315523760938116e-05, "loss": 0.0037, "num_tokens": 22941844.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 313.5, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5414130234274119, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.03748956578783691, "learning_rate": 1.0309086573244818e-05, "loss": 0.0015, "num_tokens": 22948560.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 462.25, "completions/mean_terminated_length": 462.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.541597491237779, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.0413738387869671, "learning_rate": 1.0302649257349085e-05, "loss": 0.0017, "num_tokens": 22957450.0, "reward": 1.5, "reward_std": 0.7163236141204834, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.3949388265609741, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 312.75, "completions/mean_terminated_length": 312.75, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.5417819590481461, "frac_reward_zero_std": 1.0, "grad_norm": 0.0546875, "kl": 0.06339771300554276, "learning_rate": 1.0296211815920973e-05, "loss": 0.0025, "num_tokens": 22963832.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 885.75, "completions/mean_terminated_length": 885.75, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 0.5419664268585132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0341796875, "kl": 0.028716629603877664, "learning_rate": 1.0289774251630602e-05, "loss": 0.0011, "num_tokens": 22981454.0, "reward": 1.8333332538604736, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 288.0, "completions/mean_terminated_length": 288.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.5421508946688803, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.09394082147628069, "learning_rate": 1.0283336567148124e-05, "loss": 0.0038, "num_tokens": 22987710.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 387.125, "completions/mean_terminated_length": 387.125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.5423353624792474, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.041038677329197526, "learning_rate": 1.0276898765143756e-05, "loss": 0.0016, "num_tokens": 22995439.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 466.25, "completions/mean_terminated_length": 466.25, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.5425198302896145, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.044094409327954054, "learning_rate": 1.0270460848287764e-05, "loss": 0.0018, "num_tokens": 23003425.0, "reward": 1.9444444179534912, "reward_std": 0.05939141660928726, "rewards/fixed_code_pass_all_test_reward/mean": 0.9444444179534912, "rewards/fixed_code_pass_all_test_reward/std": 0.059391383081674576, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.5427042980999816, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.05589689430780709, "learning_rate": 1.0264022819250448e-05, "loss": 0.0022, "num_tokens": 23013318.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 298.125, "completions/mean_terminated_length": 298.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.5428887659103486, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0827323542907834, "learning_rate": 1.0257584680702171e-05, "loss": 0.0033, "num_tokens": 23023599.0, "reward": 1.9456522464752197, "reward_std": 0.15371885895729065, "rewards/fixed_code_pass_all_test_reward/mean": 0.945652186870575, "rewards/fixed_code_pass_all_test_reward/std": 0.15371887385845184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 467.125, "completions/mean_terminated_length": 467.125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.5430732337207157, "frac_reward_zero_std": 1.0, "grad_norm": 0.059326171875, "kl": 0.02669946145033464, "learning_rate": 1.0251146435313328e-05, "loss": 0.0011, "num_tokens": 23037080.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 207.625, "completions/mean_terminated_length": 207.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.5432577015310828, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.05964621668681502, "learning_rate": 1.0244708085754368e-05, "loss": 0.0024, "num_tokens": 23044181.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 564.875, "completions/mean_terminated_length": 564.875, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.54344216934145, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.02920091792475432, "learning_rate": 1.0238269634695777e-05, "loss": 0.0012, "num_tokens": 23055228.0, "reward": 1.5625, "reward_std": 0.7081582546234131, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.42433422803878784, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 318.875, "completions/mean_terminated_length": 318.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.543626637151817, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.061435620533302426, "learning_rate": 1.023183108480809e-05, "loss": 0.0025, "num_tokens": 23065019.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5438111049621841, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.08609711239114404, "learning_rate": 1.0225392438761873e-05, "loss": 0.0034, "num_tokens": 23074401.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 346.25, "completions/mean_terminated_length": 346.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5439955727725512, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.042645737179555, "learning_rate": 1.0218953699227739e-05, "loss": 0.0017, "num_tokens": 23081611.0, "reward": 1.8189655542373657, "reward_std": 0.07314898073673248, "rewards/fixed_code_pass_all_test_reward/mean": 0.8189655542373657, "rewards/fixed_code_pass_all_test_reward/std": 0.07314898073673248, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 358.75, "completions/mean_terminated_length": 358.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.5441800405829182, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.08188384864479303, "learning_rate": 1.0212514868876337e-05, "loss": 0.0033, "num_tokens": 23091161.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5443645083932853, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.0773497389163822, "learning_rate": 1.020607595037836e-05, "loss": 0.0031, "num_tokens": 23100224.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 300.25, "completions/mean_terminated_length": 300.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.5445489762036525, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.15689103538170457, "learning_rate": 1.0199636946404528e-05, "loss": 0.0063, "num_tokens": 23106770.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5447334440140196, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.07716801762580872, "learning_rate": 1.01931978596256e-05, "loss": 0.0031, "num_tokens": 23114810.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 281.375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.5449179118243866, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884765625, "kl": 0.033491087262518704, "learning_rate": 1.0186758692712371e-05, "loss": 0.0013, "num_tokens": 23121469.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5451023796347537, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.06429042154923081, "learning_rate": 1.0180319448335669e-05, "loss": 0.0026, "num_tokens": 23131534.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5452868474451208, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.0394977442920208, "learning_rate": 1.0173880129166358e-05, "loss": 0.0016, "num_tokens": 23137161.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 357.875, "completions/mean_terminated_length": 357.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.5454713152554879, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.04472412448376417, "learning_rate": 1.0167440737875323e-05, "loss": 0.0018, "num_tokens": 23148000.0, "reward": 1.4392857551574707, "reward_std": 0.3475654721260071, "rewards/fixed_code_pass_all_test_reward/mean": 0.5642857551574707, "rewards/fixed_code_pass_all_test_reward/std": 0.11903402209281921, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.545655783065855, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.10002595884725451, "learning_rate": 1.0161001277133482e-05, "loss": 0.004, "num_tokens": 23155319.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5458402508762221, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.053917341167107224, "learning_rate": 1.0154561749611791e-05, "loss": 0.0022, "num_tokens": 23159501.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5460247186865892, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.0825783796608448, "learning_rate": 1.0148122157981222e-05, "loss": 0.0033, "num_tokens": 23167463.0, "reward": 1.0049999952316284, "reward_std": 0.014142122119665146, "rewards/fixed_code_pass_all_test_reward/mean": 0.004999999888241291, "rewards/fixed_code_pass_all_test_reward/std": 0.01414213515818119, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 461.25, "completions/mean_terminated_length": 461.25, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.5462091864969563, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.038091919384896755, "learning_rate": 1.0141682504912781e-05, "loss": 0.0015, "num_tokens": 23175609.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5463936543073233, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.04536410956643522, "learning_rate": 1.0135242793077495e-05, "loss": 0.0018, "num_tokens": 23182538.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.5465781221176904, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.06187007203698158, "learning_rate": 1.0128803025146423e-05, "loss": 0.0025, "num_tokens": 23190969.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 132.375, "completions/mean_terminated_length": 132.375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.5467625899280576, "frac_reward_zero_std": 0.0, "grad_norm": 4.96875, "kl": 0.0872597610577941, "learning_rate": 1.0122363203790634e-05, "loss": 0.0035, "num_tokens": 23194764.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 131.75, "completions/mean_terminated_length": 131.75, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5469470577384247, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.053103921469300985, "learning_rate": 1.0115923331681231e-05, "loss": 0.0021, "num_tokens": 23199490.0, "reward": 1.298387050628662, "reward_std": 0.5246276259422302, "rewards/fixed_code_pass_all_test_reward/mean": 0.4233870804309845, "rewards/fixed_code_pass_all_test_reward/std": 0.17107422649860382, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 335.625, "completions/mean_terminated_length": 335.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.5471315255487917, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.06545190373435616, "learning_rate": 1.0109483411489333e-05, "loss": 0.0026, "num_tokens": 23209711.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.5473159933591588, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.08510353276506066, "learning_rate": 1.010304344588608e-05, "loss": 0.0034, "num_tokens": 23214190.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 1019.125, "completions/mean_terminated_length": 1019.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.5475004611695259, "frac_reward_zero_std": 0.0, "grad_norm": 0.703125, "kl": 0.028750765486620367, "learning_rate": 1.0096603437542634e-05, "loss": 0.0012, "num_tokens": 23234959.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 303.0, "completions/mean_terminated_length": 303.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.547684928979893, "frac_reward_zero_std": 1.0, "grad_norm": 0.173828125, "kl": 0.07997561665251851, "learning_rate": 1.0090163389130162e-05, "loss": 0.0032, "num_tokens": 23241303.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 229.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.5478693967902601, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.07866302551701665, "learning_rate": 1.008372330331987e-05, "loss": 0.0031, "num_tokens": 23250578.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 363.5, "completions/mean_terminated_length": 363.5, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.5480538646006272, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.0510333813726902, "learning_rate": 1.0077283182782955e-05, "loss": 0.002, "num_tokens": 23258822.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.5482383324109943, "frac_reward_zero_std": 1.0, "grad_norm": 6.0, "kl": 0.2741277236491442, "learning_rate": 1.0070843030190647e-05, "loss": 0.011, "num_tokens": 23268573.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 314.625, "completions/mean_terminated_length": 314.625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5484228002213614, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.03558306361082941, "learning_rate": 1.0064402848214179e-05, "loss": 0.0014, "num_tokens": 23274218.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 255.5, "completions/mean_terminated_length": 255.5, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.5486072680317284, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.07552179601043463, "learning_rate": 1.0057962639524799e-05, "loss": 0.003, "num_tokens": 23284166.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 282.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.5487917358420955, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.04716900666244328, "learning_rate": 1.005152240679377e-05, "loss": 0.0019, "num_tokens": 23290386.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.5489762036524627, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.06626921938732266, "learning_rate": 1.0045082152692356e-05, "loss": 0.0027, "num_tokens": 23299196.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 203.375, "completions/mean_terminated_length": 203.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.5491606714628298, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.06986137828789651, "learning_rate": 1.0038641879891842e-05, "loss": 0.0028, "num_tokens": 23303487.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 318.125, "completions/mean_terminated_length": 318.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5493451392731968, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.030925059923902154, "learning_rate": 1.0032201591063509e-05, "loss": 0.0012, "num_tokens": 23311072.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 575.0, "completions/mean_terminated_length": 575.0, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.5495296070835639, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.032945264829322696, "learning_rate": 1.0025761288878652e-05, "loss": 0.0013, "num_tokens": 23322512.0, "reward": 1.7708332538604736, "reward_std": 0.4266657829284668, "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.4266657531261444, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 327.0, "completions/mean_terminated_length": 327.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.549714074893931, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.052105620270594954, "learning_rate": 1.0019320976008566e-05, "loss": 0.0021, "num_tokens": 23333032.0, "reward": 1.6285715103149414, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6285714507102966, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 311.0, "completions/mean_terminated_length": 311.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.549898542704298, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.0826564347371459, "learning_rate": 1.0012880655124562e-05, "loss": 0.0033, "num_tokens": 23340032.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 356.875, "completions/mean_terminated_length": 356.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.5500830105146652, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.02429720910731703, "learning_rate": 1.0006440328897936e-05, "loss": 0.001, "num_tokens": 23351695.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 198.375, "completions/mean_terminated_length": 198.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5502674783250323, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.0638869283720851, "learning_rate": 1e-05, "loss": 0.0026, "num_tokens": 23357682.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 525.0, "completions/mean_terminated_length": 525.0, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.5504519461353994, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.057589507196098566, "learning_rate": 9.99355967110207e-06, "loss": 0.0023, "num_tokens": 23368090.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 275.625, "completions/mean_terminated_length": 275.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.5506364139457665, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.049899716628715396, "learning_rate": 9.987119344875443e-06, "loss": 0.002, "num_tokens": 23373991.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 123.375, "completions/mean_terminated_length": 123.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.5508208817561335, "frac_reward_zero_std": 1.0, "grad_norm": 0.34765625, "kl": 0.10852334881201386, "learning_rate": 9.980679023991436e-06, "loss": 0.0043, "num_tokens": 23377818.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 390.625, "completions/mean_terminated_length": 390.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.5510053495665006, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0786951226182282, "learning_rate": 9.974238711121352e-06, "loss": 0.0031, "num_tokens": 23387655.0, "reward": 1.533046007156372, "reward_std": 0.4037710726261139, "rewards/fixed_code_pass_all_test_reward/mean": 0.6580460071563721, "rewards/fixed_code_pass_all_test_reward/std": 0.25195759534835815, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.5511898173768678, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.04208669392392039, "learning_rate": 9.967798408936495e-06, "loss": 0.0017, "num_tokens": 23395804.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.5513742851872349, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.11949729546904564, "learning_rate": 9.961358120108162e-06, "loss": 0.0048, "num_tokens": 23404743.0, "reward": 1.8154761791229248, "reward_std": 0.36923956871032715, "rewards/fixed_code_pass_all_test_reward/mean": 0.9404761791229248, "rewards/fixed_code_pass_all_test_reward/std": 0.16835874319076538, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 162.25, "completions/mean_terminated_length": 162.25, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5515587529976019, "frac_reward_zero_std": 1.0, "grad_norm": 0.20703125, "kl": 0.07909550401382148, "learning_rate": 9.954917847307647e-06, "loss": 0.0032, "num_tokens": 23408769.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.551743220807969, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.0548078753054142, "learning_rate": 9.948477593206233e-06, "loss": 0.0022, "num_tokens": 23413197.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5519276886183361, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.09515662537887692, "learning_rate": 9.942037360475205e-06, "loss": 0.0038, "num_tokens": 23419446.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 329.875, "completions/mean_terminated_length": 329.875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.5521121564287031, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.030920034390874207, "learning_rate": 9.935597151785824e-06, "loss": 0.0012, "num_tokens": 23427501.0, "reward": 1.8854167461395264, "reward_std": 0.3240906000137329, "rewards/fixed_code_pass_all_test_reward/mean": 0.8854166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 391.875, "completions/mean_terminated_length": 391.875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.5522966242390703, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.05099313473328948, "learning_rate": 9.929156969809358e-06, "loss": 0.002, "num_tokens": 23435108.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 2994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 189.625, "completions/mean_terminated_length": 189.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.5524810920494374, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.06319018569774926, "learning_rate": 9.92271681721705e-06, "loss": 0.0025, "num_tokens": 23442465.0, "reward": 1.8698979616165161, "reward_std": 0.3517819046974182, "rewards/fixed_code_pass_all_test_reward/mean": 0.9948979616165161, "rewards/fixed_code_pass_all_test_reward/std": 0.01443074457347393, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 208.5, "completions/mean_terminated_length": 208.5, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.5526655598598045, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.06402226025238633, "learning_rate": 9.916276696680135e-06, "loss": 0.0026, "num_tokens": 23447917.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 289.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5528500276701716, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.052135222824290395, "learning_rate": 9.909836610869841e-06, "loss": 0.0021, "num_tokens": 23453737.0, "reward": 1.6964285373687744, "reward_std": 0.3000243008136749, "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.1266293227672577, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5530344954805386, "frac_reward_zero_std": 1.0, "grad_norm": 0.234375, "kl": 0.04157649795524776, "learning_rate": 9.903396562457371e-06, "loss": 0.0017, "num_tokens": 23459548.0, "reward": 1.808823585510254, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8088235259056091, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 300.625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.5532189632909057, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.06422287551686168, "learning_rate": 9.896956554113924e-06, "loss": 0.0026, "num_tokens": 23471665.0, "reward": 1.640625, "reward_std": 0.19408094882965088, "rewards/fixed_code_pass_all_test_reward/mean": 0.640625, "rewards/fixed_code_pass_all_test_reward/std": 0.19408094882965088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 361.5, "completions/mean_terminated_length": 361.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.5534034311012729, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.04762482224032283, "learning_rate": 9.89051658851067e-06, "loss": 0.0019, "num_tokens": 23479957.0, "reward": 1.53125, "reward_std": 0.5077524185180664, "rewards/fixed_code_pass_all_test_reward/mean": 0.53125, "rewards/fixed_code_pass_all_test_reward/std": 0.5077524185180664, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 204.625, "completions/mean_terminated_length": 204.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.55358789891164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.0388400285737589, "learning_rate": 9.884076668318774e-06, "loss": 0.0016, "num_tokens": 23486634.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 327.5, "completions/mean_terminated_length": 327.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.553772366722007, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.06323436601087451, "learning_rate": 9.87763679620937e-06, "loss": 0.0025, "num_tokens": 23496774.0, "reward": 1.875, "reward_std": 0.17268884181976318, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.172688826918602, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 156.375, "completions/mean_terminated_length": 156.375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.5539568345323741, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.08443675329908729, "learning_rate": 9.871196974853579e-06, "loss": 0.0034, "num_tokens": 23505129.0, "reward": 1.03125, "reward_std": 0.019287917762994766, "rewards/fixed_code_pass_all_test_reward/mean": 0.03125, "rewards/fixed_code_pass_all_test_reward/std": 0.019287919625639915, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5541413023427412, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.0779094249010086, "learning_rate": 9.864757206922505e-06, "loss": 0.0031, "num_tokens": 23511379.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5543257701531082, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.08037409139797091, "learning_rate": 9.858317495087222e-06, "loss": 0.0032, "num_tokens": 23520522.0, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 433.375, "completions/mean_terminated_length": 433.375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5545102379634754, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.037936384323984385, "learning_rate": 9.85187784201878e-06, "loss": 0.0015, "num_tokens": 23532805.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5546947057738425, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.0627927714958787, "learning_rate": 9.845438250388212e-06, "loss": 0.0025, "num_tokens": 23537425.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.5548791735842096, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.03053699410520494, "learning_rate": 9.83899872286652e-06, "loss": 0.0012, "num_tokens": 23543014.0, "reward": 1.1538461446762085, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1538461595773697, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.5550636413945766, "frac_reward_zero_std": 1.0, "grad_norm": 0.171875, "kl": 0.09543070802465081, "learning_rate": 9.832559262124682e-06, "loss": 0.0038, "num_tokens": 23546862.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.5552481092049437, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.07387631665915251, "learning_rate": 9.826119870833644e-06, "loss": 0.003, "num_tokens": 23557946.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 280.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5554325770153108, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.04109498905017972, "learning_rate": 9.819680551664331e-06, "loss": 0.0016, "num_tokens": 23564400.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5556170448256779, "frac_reward_zero_std": 1.0, "grad_norm": 0.1220703125, "kl": 0.08092844672501087, "learning_rate": 9.813241307287629e-06, "loss": 0.0032, "num_tokens": 23571740.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 415.625, "completions/mean_terminated_length": 415.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.555801512636045, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.09677864680998027, "learning_rate": 9.806802140374403e-06, "loss": 0.0039, "num_tokens": 23579593.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 3013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 214.0, "completions/mean_terminated_length": 214.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.5559859804464121, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.04767852043733001, "learning_rate": 9.800363053595473e-06, "loss": 0.0019, "num_tokens": 23584113.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 276.375, "completions/mean_terminated_length": 276.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5561704482567792, "frac_reward_zero_std": 0.0, "grad_norm": 2.953125, "kl": 0.028022630373016, "learning_rate": 9.793924049621643e-06, "loss": 0.0011, "num_tokens": 23590940.0, "reward": 1.615384578704834, "reward_std": 0.4111711084842682, "rewards/fixed_code_pass_all_test_reward/mean": 0.6153846383094788, "rewards/fixed_code_pass_all_test_reward/std": 0.41117116808891296, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 143.875, "completions/mean_terminated_length": 143.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.5563549160671463, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.09499848261475563, "learning_rate": 9.787485131123665e-06, "loss": 0.0038, "num_tokens": 23599323.0, "reward": 1.0317460298538208, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0317460335791111, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.5565393838775133, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.038232608465477824, "learning_rate": 9.781046300772264e-06, "loss": 0.0015, "num_tokens": 23605265.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 257.875, "completions/mean_terminated_length": 257.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.5567238516878804, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.05210935324430466, "learning_rate": 9.774607561238132e-06, "loss": 0.0021, "num_tokens": 23610144.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5569083194982476, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.045980810886248946, "learning_rate": 9.768168915191913e-06, "loss": 0.0018, "num_tokens": 23618379.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 216.75, "completions/mean_terminated_length": 216.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.5570927873086147, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0662982661742717, "learning_rate": 9.761730365304225e-06, "loss": 0.0027, "num_tokens": 23628009.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 440.0, "completions/mean_terminated_length": 440.0, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.5572772551189817, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.055862643639557064, "learning_rate": 9.755291914245635e-06, "loss": 0.0022, "num_tokens": 23638089.0, "reward": 1.6875, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.5574617229293488, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.080532297026366, "learning_rate": 9.748853564686675e-06, "loss": 0.0032, "num_tokens": 23643602.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.5576461907397159, "frac_reward_zero_std": 1.0, "grad_norm": 0.1806640625, "kl": 0.06667769281193614, "learning_rate": 9.742415319297832e-06, "loss": 0.0027, "num_tokens": 23648317.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 543.25, "completions/mean_terminated_length": 543.25, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.557830658550083, "frac_reward_zero_std": 1.0, "grad_norm": 0.039306640625, "kl": 0.02369022398488596, "learning_rate": 9.735977180749553e-06, "loss": 0.0009, "num_tokens": 23662847.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 361.375, "completions/mean_terminated_length": 361.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5580151263604501, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.06614408118184656, "learning_rate": 9.729539151712238e-06, "loss": 0.0026, "num_tokens": 23674098.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5581995941708172, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.07662745611742139, "learning_rate": 9.723101234856245e-06, "loss": 0.0031, "num_tokens": 23683632.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.5583840619811843, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.05629041721113026, "learning_rate": 9.71666343285188e-06, "loss": 0.0023, "num_tokens": 23689829.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.5585685297915514, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.06561880325898528, "learning_rate": 9.710225748369402e-06, "loss": 0.0026, "num_tokens": 23700033.0, "reward": 1.899999976158142, "reward_std": 0.10690455138683319, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.10690449178218842, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.5587529976019184, "frac_reward_zero_std": 1.0, "grad_norm": 0.1748046875, "kl": 0.10448156576603651, "learning_rate": 9.703788184079029e-06, "loss": 0.0042, "num_tokens": 23706327.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 343.0, "completions/mean_terminated_length": 343.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5589374654122855, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.04843891761265695, "learning_rate": 9.697350742650917e-06, "loss": 0.0019, "num_tokens": 23713727.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 181.375, "completions/mean_terminated_length": 181.375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5591219332226527, "frac_reward_zero_std": 1.0, "grad_norm": 0.63671875, "kl": 0.10429562302306294, "learning_rate": 9.690913426755185e-06, "loss": 0.0042, "num_tokens": 23721794.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 387.875, "completions/mean_terminated_length": 387.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5593064010330198, "frac_reward_zero_std": 0.0, "grad_norm": 0.62109375, "kl": 0.06455493532121181, "learning_rate": 9.684476239061888e-06, "loss": 0.0026, "num_tokens": 23728921.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 315.75, "completions/mean_terminated_length": 315.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.5594908688433868, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.048352137906476855, "learning_rate": 9.678039182241035e-06, "loss": 0.0019, "num_tokens": 23738815.0, "reward": 1.6458332538604736, "reward_std": 0.6692657470703125, "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.32043495774269104, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 370.125, "completions/mean_terminated_length": 370.125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.5596753366537539, "frac_reward_zero_std": 1.0, "grad_norm": 0.0284423828125, "kl": 0.011484276270493865, "learning_rate": 9.671602258962574e-06, "loss": 0.0005, "num_tokens": 23746216.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 204.625, "completions/mean_terminated_length": 204.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.559859804464121, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.06120908074080944, "learning_rate": 9.665165471896407e-06, "loss": 0.0024, "num_tokens": 23751717.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.560044272274488, "frac_reward_zero_std": 1.0, "grad_norm": 1.015625, "kl": 0.09091737563721836, "learning_rate": 9.658728823712373e-06, "loss": 0.0036, "num_tokens": 23756510.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 203.75, "completions/mean_terminated_length": 203.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5602287400848552, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.12170455139130354, "learning_rate": 9.65229231708025e-06, "loss": 0.0049, "num_tokens": 23764764.0, "reward": 1.9083333015441895, "reward_std": 0.2592725157737732, "rewards/fixed_code_pass_all_test_reward/mean": 0.9083333015441895, "rewards/fixed_code_pass_all_test_reward/std": 0.2592725157737732, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 379.0, "completions/mean_terminated_length": 379.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5604132078952223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.02899075555615127, "learning_rate": 9.64585595466977e-06, "loss": 0.0012, "num_tokens": 23771556.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5605976757055894, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.07379932631738484, "learning_rate": 9.639419739150587e-06, "loss": 0.003, "num_tokens": 23781942.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.5607821435159565, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.06019414047477767, "learning_rate": 9.632983673192314e-06, "loss": 0.0024, "num_tokens": 23785829.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 293.25, "completions/mean_terminated_length": 293.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5609666113263235, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.10131181869655848, "learning_rate": 9.626547759464486e-06, "loss": 0.0041, "num_tokens": 23796871.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 280.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.5611510791366906, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.07449939387151971, "learning_rate": 9.620112000636585e-06, "loss": 0.003, "num_tokens": 23803559.0, "reward": 1.057692289352417, "reward_std": 0.03560848906636238, "rewards/fixed_code_pass_all_test_reward/mean": 0.05769231170415878, "rewards/fixed_code_pass_all_test_reward/std": 0.03560846671462059, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5613355469470578, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.0450738666113466, "learning_rate": 9.613676399378019e-06, "loss": 0.0018, "num_tokens": 23815484.0, "reward": 1.7613636255264282, "reward_std": 0.3698734641075134, "rewards/fixed_code_pass_all_test_reward/mean": 0.8863636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.21041364967823029, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 435.625, "completions/mean_terminated_length": 435.625, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.5615200147574249, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.038478299509733915, "learning_rate": 9.607240958358141e-06, "loss": 0.0015, "num_tokens": 23825041.0, "reward": 1.7155171632766724, "reward_std": 0.3762753903865814, "rewards/fixed_code_pass_all_test_reward/mean": 0.7155172228813171, "rewards/fixed_code_pass_all_test_reward/std": 0.37627536058425903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5617044825677919, "frac_reward_zero_std": 1.0, "grad_norm": 0.05615234375, "kl": 0.05108348198700696, "learning_rate": 9.600805680246237e-06, "loss": 0.002, "num_tokens": 23832575.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 414.375, "completions/mean_terminated_length": 414.375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.561888950378159, "frac_reward_zero_std": 1.0, "grad_norm": 0.044189453125, "kl": 0.04368687281385064, "learning_rate": 9.594370567711512e-06, "loss": 0.0017, "num_tokens": 23841730.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 375.125, "completions/mean_terminated_length": 375.125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.5620734181885261, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.033537911833263934, "learning_rate": 9.58793562342312e-06, "loss": 0.0013, "num_tokens": 23849099.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 322.875, "completions/mean_terminated_length": 322.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.5622578859988931, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.07128589088097215, "learning_rate": 9.581500850050134e-06, "loss": 0.0029, "num_tokens": 23855378.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5624423538092603, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.07034411723725498, "learning_rate": 9.575066250261554e-06, "loss": 0.0028, "num_tokens": 23859802.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5626268216196274, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.051201453199610114, "learning_rate": 9.568631826726322e-06, "loss": 0.002, "num_tokens": 23867013.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.5628112894299945, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.20083571667782962, "learning_rate": 9.56219758211329e-06, "loss": 0.008, "num_tokens": 23874313.0, "reward": 1.7338709831237793, "reward_std": 0.37569618225097656, "rewards/fixed_code_pass_all_test_reward/mean": 0.7338709831237793, "rewards/fixed_code_pass_all_test_reward/std": 0.37569618225097656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.5629957572403615, "frac_reward_zero_std": 1.0, "grad_norm": 0.220703125, "kl": 0.05519722495228052, "learning_rate": 9.55576351909125e-06, "loss": 0.0022, "num_tokens": 23878406.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 417.5, "completions/mean_terminated_length": 417.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5631802250507286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0537109375, "kl": 0.047078351373784244, "learning_rate": 9.549329640328905e-06, "loss": 0.0019, "num_tokens": 23892386.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 275.875, "completions/mean_terminated_length": 275.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.5633646928610957, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.06855276389978826, "learning_rate": 9.542895948494898e-06, "loss": 0.0027, "num_tokens": 23900961.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 325.375, "completions/mean_terminated_length": 325.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.5635491606714629, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.04745907289907336, "learning_rate": 9.536462446257777e-06, "loss": 0.0019, "num_tokens": 23909692.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.56373362848183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.03207184583880007, "learning_rate": 9.530029136286027e-06, "loss": 0.0013, "num_tokens": 23914625.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 194.25, "completions/mean_terminated_length": 194.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.563918096292197, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.04175345797557384, "learning_rate": 9.52359602124804e-06, "loss": 0.0017, "num_tokens": 23920315.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 172.125, "completions/mean_terminated_length": 172.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.5641025641025641, "frac_reward_zero_std": 1.0, "grad_norm": 0.24609375, "kl": 0.06413948489353061, "learning_rate": 9.51716310381214e-06, "loss": 0.0026, "num_tokens": 23924772.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 306.125, "completions/mean_terminated_length": 306.125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.5642870319129312, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.032394174369983375, "learning_rate": 9.510730386646557e-06, "loss": 0.0013, "num_tokens": 23931197.0, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 508.375, "completions/mean_terminated_length": 508.375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.5644714997232982, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.03801239421591163, "learning_rate": 9.504297872419441e-06, "loss": 0.0015, "num_tokens": 23950480.0, "reward": 1.0282257795333862, "reward_std": 0.011404937133193016, "rewards/fixed_code_pass_all_test_reward/mean": 0.02822580561041832, "rewards/fixed_code_pass_all_test_reward/std": 0.011404948309063911, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5646559675336654, "frac_reward_zero_std": 1.0, "grad_norm": 0.228515625, "kl": 0.07098966650664806, "learning_rate": 9.497865563798867e-06, "loss": 0.0028, "num_tokens": 23958675.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 259.125, "completions/mean_terminated_length": 259.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5648404353440325, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.05833358853124082, "learning_rate": 9.491433463452812e-06, "loss": 0.0023, "num_tokens": 23966188.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.5650249031543996, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.08410008158534765, "learning_rate": 9.485001574049176e-06, "loss": 0.0034, "num_tokens": 23975124.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.5652093709647666, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.08379625016823411, "learning_rate": 9.478569898255765e-06, "loss": 0.0034, "num_tokens": 23981859.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 296.875, "completions/mean_terminated_length": 296.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.5653938387751337, "frac_reward_zero_std": 1.0, "grad_norm": 0.04931640625, "kl": 0.03385592647828162, "learning_rate": 9.472138438740303e-06, "loss": 0.0014, "num_tokens": 23989034.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 309.75, "completions/mean_terminated_length": 309.75, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.5655783065855008, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.06533441669307649, "learning_rate": 9.465707198170414e-06, "loss": 0.0026, "num_tokens": 23997480.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 304.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.565762774395868, "frac_reward_zero_std": 1.0, "grad_norm": 0.251953125, "kl": 0.09715726831927896, "learning_rate": 9.459276179213644e-06, "loss": 0.0039, "num_tokens": 24006970.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 279.5, "completions/mean_terminated_length": 279.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.565947242206235, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.048136515310034156, "learning_rate": 9.45284538453744e-06, "loss": 0.0019, "num_tokens": 24016414.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 312.625, "completions/mean_terminated_length": 312.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.5661317100166021, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.0588270666776225, "learning_rate": 9.446414816809153e-06, "loss": 0.0024, "num_tokens": 24025027.0, "reward": 1.8229166269302368, "reward_std": 0.3100099265575409, "rewards/fixed_code_pass_all_test_reward/mean": 0.8229166269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.3100099265575409, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 357.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5663161778269692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.045409470330923796, "learning_rate": 9.439984478696048e-06, "loss": 0.0018, "num_tokens": 24037519.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5665006456373363, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.06128852581605315, "learning_rate": 9.433554372865286e-06, "loss": 0.0025, "num_tokens": 24045953.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 414.0, "completions/mean_terminated_length": 414.0, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.5666851134477033, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.060522597283124924, "learning_rate": 9.42712450198394e-06, "loss": 0.0024, "num_tokens": 24054281.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 326.5, "completions/mean_terminated_length": 326.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.5668695812580705, "frac_reward_zero_std": 1.0, "grad_norm": 0.043701171875, "kl": 0.03673248062841594, "learning_rate": 9.420694868718978e-06, "loss": 0.0015, "num_tokens": 24061557.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.5670540490684376, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.05765954311937094, "learning_rate": 9.414265475737278e-06, "loss": 0.0023, "num_tokens": 24072650.0, "reward": 1.370192289352417, "reward_std": 0.1495802402496338, "rewards/fixed_code_pass_all_test_reward/mean": 0.3701923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.14958028495311737, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5672385168788047, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.06456700386479497, "learning_rate": 9.407836325705607e-06, "loss": 0.0026, "num_tokens": 24077797.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 286.75, "completions/mean_terminated_length": 286.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5674229846891717, "frac_reward_zero_std": 0.0, "grad_norm": 3.390625, "kl": 0.12052298476919532, "learning_rate": 9.401407421290644e-06, "loss": 0.0048, "num_tokens": 24087043.0, "reward": 1.2549999952316284, "reward_std": 0.5603825449943542, "rewards/fixed_code_pass_all_test_reward/mean": 0.3799999952316284, "rewards/fixed_code_pass_all_test_reward/std": 0.2836496829986572, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 370.75, "completions/mean_terminated_length": 370.75, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5676074524995388, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.03635679604485631, "learning_rate": 9.394978765158955e-06, "loss": 0.0015, "num_tokens": 24095113.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 284.25, "completions/mean_terminated_length": 284.25, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.5677919203099059, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.044754753122106194, "learning_rate": 9.38855035997701e-06, "loss": 0.0018, "num_tokens": 24105611.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 417.875, "completions/mean_terminated_length": 417.875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.567976388120273, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.05407174653373659, "learning_rate": 9.382122208411174e-06, "loss": 0.0022, "num_tokens": 24115818.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.5681608559306401, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.040209084982052445, "learning_rate": 9.375694313127702e-06, "loss": 0.0016, "num_tokens": 24124558.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 315.875, "completions/mean_terminated_length": 315.875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.5683453237410072, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.06220984098035842, "learning_rate": 9.369266676792748e-06, "loss": 0.0025, "num_tokens": 24133909.0, "reward": 1.75, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 428.625, "completions/mean_terminated_length": 428.625, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.5685297915513743, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.03674983815290034, "learning_rate": 9.362839302072354e-06, "loss": 0.0015, "num_tokens": 24143962.0, "reward": 1.0291666984558105, "reward_std": 0.04520673677325249, "rewards/fixed_code_pass_all_test_reward/mean": 0.02916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.04520675912499428, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.5687142593617414, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.06184509419836104, "learning_rate": 9.356412191632458e-06, "loss": 0.0025, "num_tokens": 24152266.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 246.375, "completions/mean_terminated_length": 246.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5688987271721084, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.07933429442346096, "learning_rate": 9.34998534813889e-06, "loss": 0.0032, "num_tokens": 24157141.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5690831949824755, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.036679193610325456, "learning_rate": 9.343558774257364e-06, "loss": 0.0015, "num_tokens": 24162584.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5692676627928427, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.056490780552849174, "learning_rate": 9.337132472653483e-06, "loss": 0.0023, "num_tokens": 24167258.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5694521306032098, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.07543063955381513, "learning_rate": 9.330706445992737e-06, "loss": 0.003, "num_tokens": 24176764.0, "reward": 1.2836538553237915, "reward_std": 0.4470895230770111, "rewards/fixed_code_pass_all_test_reward/mean": 0.2836538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.4470895528793335, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 338.375, "completions/mean_terminated_length": 338.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.5696365984135768, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.026040896773338318, "learning_rate": 9.324280696940507e-06, "loss": 0.001, "num_tokens": 24184327.0, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 310.0, "completions/mean_terminated_length": 310.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.5698210662239439, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.03314630303066224, "learning_rate": 9.31785522816205e-06, "loss": 0.0013, "num_tokens": 24194039.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 223.75, "completions/mean_terminated_length": 223.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.570005534034311, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.08086887001991272, "learning_rate": 9.311430042322517e-06, "loss": 0.0032, "num_tokens": 24203749.0, "reward": 1.105769157409668, "reward_std": 0.07623317092657089, "rewards/fixed_code_pass_all_test_reward/mean": 0.10576923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.07623317092657089, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.570190001844678, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201171875, "kl": 0.04185581853380427, "learning_rate": 9.305005142086933e-06, "loss": 0.0017, "num_tokens": 24213370.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 252.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.5703744696550452, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.04427492921240628, "learning_rate": 9.298580530120206e-06, "loss": 0.0018, "num_tokens": 24218574.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 533.625, "completions/mean_terminated_length": 533.625, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.5705589374654123, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.021625475317705423, "learning_rate": 9.29215620908713e-06, "loss": 0.0009, "num_tokens": 24229739.0, "reward": 1.2708333730697632, "reward_std": 0.12400398403406143, "rewards/fixed_code_pass_all_test_reward/mean": 0.2708333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.12400397658348083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 232.625, "completions/mean_terminated_length": 232.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5707434052757794, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.06780269369482994, "learning_rate": 9.285732181652374e-06, "loss": 0.0027, "num_tokens": 24236896.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.5709278730861465, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.06002924335189164, "learning_rate": 9.279308450480486e-06, "loss": 0.0024, "num_tokens": 24245222.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.5711123408965135, "frac_reward_zero_std": 1.0, "grad_norm": 0.11572265625, "kl": 0.05408326513133943, "learning_rate": 9.272885018235887e-06, "loss": 0.0022, "num_tokens": 24251084.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 354.125, "completions/mean_terminated_length": 354.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5712968087068806, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.09254475589841604, "learning_rate": 9.266461887582885e-06, "loss": 0.0037, "num_tokens": 24260589.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 303.625, "completions/mean_terminated_length": 303.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5714812765172478, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.036447549238801, "learning_rate": 9.26003906118565e-06, "loss": 0.0015, "num_tokens": 24271042.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 205.75, "completions/mean_terminated_length": 205.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5716657443276149, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.04918173630721867, "learning_rate": 9.253616541708235e-06, "loss": 0.002, "num_tokens": 24275912.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 247.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5718502121379819, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.05809566378593445, "learning_rate": 9.247194331814561e-06, "loss": 0.0023, "num_tokens": 24281812.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.572034679948349, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.033518762211315334, "learning_rate": 9.240772434168421e-06, "loss": 0.0013, "num_tokens": 24290007.0, "reward": 1.2777777910232544, "reward_std": 0.0753851979970932, "rewards/fixed_code_pass_all_test_reward/mean": 0.2777777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.07538522779941559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 363.5, "completions/mean_terminated_length": 363.5, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5722191477587161, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.020379821624374017, "learning_rate": 9.234350851433482e-06, "loss": 0.0008, "num_tokens": 24301595.0, "reward": 1.2727272510528564, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 504.125, "completions/mean_terminated_length": 504.125, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.5724036155690831, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.03359130735043436, "learning_rate": 9.227929586273275e-06, "loss": 0.0013, "num_tokens": 24311420.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 252.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.5725880833794503, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.03396354021970183, "learning_rate": 9.221508641351206e-06, "loss": 0.0014, "num_tokens": 24318128.0, "reward": 1.8125, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 264.125, "completions/mean_terminated_length": 264.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.5727725511898174, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.07016346789896488, "learning_rate": 9.215088019330538e-06, "loss": 0.0028, "num_tokens": 24328081.0, "reward": 1.9090909957885742, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 212.25, "completions/mean_terminated_length": 212.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.5729570190001845, "frac_reward_zero_std": 1.0, "grad_norm": 0.18359375, "kl": 0.06919286260381341, "learning_rate": 9.208667722874414e-06, "loss": 0.0028, "num_tokens": 24332635.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 184.125, "completions/mean_terminated_length": 184.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.5731414868105515, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.07550233486108482, "learning_rate": 9.20224775464583e-06, "loss": 0.003, "num_tokens": 24342500.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 514.625, "completions/mean_terminated_length": 514.625, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.5733259546209186, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.028719107853248715, "learning_rate": 9.19582811730765e-06, "loss": 0.0011, "num_tokens": 24355769.0, "reward": 1.8611111640930176, "reward_std": 0.3501070737838745, "rewards/fixed_code_pass_all_test_reward/mean": 0.9861111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 201.75, "completions/mean_terminated_length": 201.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5735104224312857, "frac_reward_zero_std": 1.0, "grad_norm": 0.032470703125, "kl": 0.0221674139611423, "learning_rate": 9.189408813522602e-06, "loss": 0.0009, "num_tokens": 24364839.0, "reward": 1.6363636255264282, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 292.75, "completions/mean_terminated_length": 292.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5736948902416529, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.06800936837680638, "learning_rate": 9.182989845953275e-06, "loss": 0.0027, "num_tokens": 24373501.0, "reward": 1.5191175937652588, "reward_std": 0.43090853095054626, "rewards/fixed_code_pass_all_test_reward/mean": 0.5191176533699036, "rewards/fixed_code_pass_all_test_reward/std": 0.43090853095054626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.57387935805202, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.09679208509624004, "learning_rate": 9.176571217262118e-06, "loss": 0.0039, "num_tokens": 24382922.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 216.75, "completions/mean_terminated_length": 216.75, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.574063825862387, "frac_reward_zero_std": 1.0, "grad_norm": 0.166015625, "kl": 0.10754885897040367, "learning_rate": 9.170152930111437e-06, "loss": 0.0043, "num_tokens": 24391952.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5742482936727541, "frac_reward_zero_std": 1.0, "grad_norm": 0.361328125, "kl": 0.08281107386574149, "learning_rate": 9.163734987163403e-06, "loss": 0.0033, "num_tokens": 24397198.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5744327614831212, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.06581409554928541, "learning_rate": 9.157317391080036e-06, "loss": 0.0026, "num_tokens": 24403107.0, "reward": 1.548387050628662, "reward_std": 0.48279449343681335, "rewards/fixed_code_pass_all_test_reward/mean": 0.5483870506286621, "rewards/fixed_code_pass_all_test_reward/std": 0.48279452323913574, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 432.75, "completions/mean_terminated_length": 432.75, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5746172292934882, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.06747109908610582, "learning_rate": 9.15090014452322e-06, "loss": 0.0027, "num_tokens": 24410961.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.5748016971038554, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.06242672586813569, "learning_rate": 9.144483250154688e-06, "loss": 0.0025, "num_tokens": 24421452.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.5749861649142225, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "kl": 0.12354521919041872, "learning_rate": 9.138066710636031e-06, "loss": 0.0049, "num_tokens": 24429395.0, "reward": 1.6944444179534912, "reward_std": 0.4552263617515564, "rewards/fixed_code_pass_all_test_reward/mean": 0.8194444179534912, "rewards/fixed_code_pass_all_test_reward/std": 0.36581188440322876, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 191.375, "completions/mean_terminated_length": 191.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.5751706327245896, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.03467771445866674, "learning_rate": 9.131650528628688e-06, "loss": 0.0014, "num_tokens": 24437510.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 350.875, "completions/mean_terminated_length": 350.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.5753551005349566, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.04091808735392988, "learning_rate": 9.125234706793956e-06, "loss": 0.0016, "num_tokens": 24444941.0, "reward": 1.954545497894287, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9545454382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 257.875, "completions/mean_terminated_length": 257.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.5755395683453237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.02648094407049939, "learning_rate": 9.118819247792977e-06, "loss": 0.0011, "num_tokens": 24450548.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 342.25, "completions/mean_terminated_length": 342.25, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.5757240361556908, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.06292327074334025, "learning_rate": 9.112404154286748e-06, "loss": 0.0025, "num_tokens": 24460590.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.575908503966058, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.10138398502022028, "learning_rate": 9.10598942893611e-06, "loss": 0.0041, "num_tokens": 24465555.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 208.375, "completions/mean_terminated_length": 208.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.576092971776425, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.055706510320305824, "learning_rate": 9.099575074401744e-06, "loss": 0.0022, "num_tokens": 24471230.0, "reward": 1.7794811725616455, "reward_std": 0.36464211344718933, "rewards/fixed_code_pass_all_test_reward/mean": 0.7794811725616455, "rewards/fixed_code_pass_all_test_reward/std": 0.3646421432495117, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.5762774395867921, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.02453088155016303, "learning_rate": 9.0931610933442e-06, "loss": 0.001, "num_tokens": 24479147.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 357.0, "completions/mean_terminated_length": 357.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.5764619073971592, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.04281882499344647, "learning_rate": 9.086747488423852e-06, "loss": 0.0017, "num_tokens": 24485187.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 323.875, "completions/mean_terminated_length": 323.875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5766463752075263, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.03227805020287633, "learning_rate": 9.080334262300925e-06, "loss": 0.0013, "num_tokens": 24492874.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 320.5, "completions/mean_terminated_length": 320.5, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5768308430178933, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.027861300157383084, "learning_rate": 9.073921417635485e-06, "loss": 0.0011, "num_tokens": 24500302.0, "reward": 0.9583333730697632, "reward_std": 0.4520675837993622, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022911310196, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5770153108282605, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.06531839072704315, "learning_rate": 9.067508957087444e-06, "loss": 0.0026, "num_tokens": 24508140.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 342.625, "completions/mean_terminated_length": 342.625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.5771997786386276, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.0650955094024539, "learning_rate": 9.061096883316555e-06, "loss": 0.0026, "num_tokens": 24519585.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5773842464489947, "frac_reward_zero_std": 1.0, "grad_norm": 0.2099609375, "kl": 0.051836152793839574, "learning_rate": 9.0546851989824e-06, "loss": 0.0021, "num_tokens": 24524583.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 360.0, "completions/mean_terminated_length": 360.0, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.5775687142593617, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.05801359424367547, "learning_rate": 9.048273906744415e-06, "loss": 0.0023, "num_tokens": 24531303.0, "reward": 1.53125, "reward_std": 0.1293872892856598, "rewards/fixed_code_pass_all_test_reward/mean": 0.53125, "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.5777531820697288, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.10005026031285524, "learning_rate": 9.04186300926186e-06, "loss": 0.004, "num_tokens": 24539926.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 656.375, "completions/mean_terminated_length": 656.375, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 0.5779376498800959, "frac_reward_zero_std": 1.0, "grad_norm": 0.031005859375, "kl": 0.026826492452528328, "learning_rate": 9.03545250919384e-06, "loss": 0.0011, "num_tokens": 24556289.0, "reward": 1.185185194015503, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.18518517911434174, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.5781221176904631, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.04252563719637692, "learning_rate": 9.02904240919929e-06, "loss": 0.0017, "num_tokens": 24564094.0, "reward": 1.308333396911621, "reward_std": 0.4334249198436737, "rewards/fixed_code_pass_all_test_reward/mean": 0.3083333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.4334249198436737, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.5783065855008301, "frac_reward_zero_std": 1.0, "grad_norm": 0.06103515625, "kl": 0.10599593073129654, "learning_rate": 9.02263271193698e-06, "loss": 0.0042, "num_tokens": 24572208.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.5784910533111972, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.05992647306993604, "learning_rate": 9.016223420065519e-06, "loss": 0.0024, "num_tokens": 24583354.0, "reward": 1.4583333730697632, "reward_std": 0.2828895151615143, "rewards/fixed_code_pass_all_test_reward/mean": 0.4583333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.2828894853591919, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 304.875, "completions/mean_terminated_length": 304.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.5786755211215643, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.07222713192459196, "learning_rate": 9.009814536243338e-06, "loss": 0.0029, "num_tokens": 24590561.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 250.75, "completions/mean_terminated_length": 250.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5788599889319314, "frac_reward_zero_std": 0.0, "grad_norm": 7.6875, "kl": 0.41720869950950146, "learning_rate": 9.003406063128707e-06, "loss": 0.0167, "num_tokens": 24598943.0, "reward": 0.9158163070678711, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.040816325694322586, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 233.875, "completions/mean_terminated_length": 233.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.5790444567422984, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.05713837337680161, "learning_rate": 8.99699800337972e-06, "loss": 0.0023, "num_tokens": 24606758.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 376.125, "completions/mean_terminated_length": 376.125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5792289245526656, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.05819526920095086, "learning_rate": 8.990590359654304e-06, "loss": 0.0023, "num_tokens": 24617431.0, "reward": 1.2604166269302368, "reward_std": 0.21564549207687378, "rewards/fixed_code_pass_all_test_reward/mean": 0.2604166567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.21564547717571259, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 460.75, "completions/mean_terminated_length": 460.75, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.5794133923630327, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.033185570035129786, "learning_rate": 8.984183134610206e-06, "loss": 0.0013, "num_tokens": 24627045.0, "reward": 1.892241358757019, "reward_std": 0.30478742718696594, "rewards/fixed_code_pass_all_test_reward/mean": 0.892241358757019, "rewards/fixed_code_pass_all_test_reward/std": 0.30478739738464355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.5795978601733998, "frac_reward_zero_std": 1.0, "grad_norm": 0.4453125, "kl": 0.08338387520052493, "learning_rate": 8.977776330905012e-06, "loss": 0.0033, "num_tokens": 24631011.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5797823279837668, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.08756185998208821, "learning_rate": 8.971369951196121e-06, "loss": 0.0035, "num_tokens": 24635217.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.5799667957941339, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.08852395322173834, "learning_rate": 8.96496399814076e-06, "loss": 0.0035, "num_tokens": 24657562.0, "reward": 1.8177711963653564, "reward_std": 0.20359432697296143, "rewards/fixed_code_pass_all_test_reward/mean": 0.8177710771560669, "rewards/fixed_code_pass_all_test_reward/std": 0.20359434187412262, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 452.625, "completions/mean_terminated_length": 452.625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.580151263604501, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.03906269231811166, "learning_rate": 8.958558474395987e-06, "loss": 0.0016, "num_tokens": 24665791.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 198.0, "completions/mean_terminated_length": 198.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.580335731414868, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.055351070244796574, "learning_rate": 8.952153382618665e-06, "loss": 0.0022, "num_tokens": 24670247.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.5805201992252352, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.05936416203621775, "learning_rate": 8.945748725465497e-06, "loss": 0.0024, "num_tokens": 24679928.0, "reward": 1.7231011390686035, "reward_std": 0.11188396066427231, "rewards/fixed_code_pass_all_test_reward/mean": 0.7231012582778931, "rewards/fixed_code_pass_all_test_reward/std": 0.11188399791717529, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 764.375, "completions/mean_terminated_length": 764.375, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "epoch": 0.5807046670356023, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.03449773939792067, "learning_rate": 8.93934450559299e-06, "loss": 0.0014, "num_tokens": 24695659.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 213.125, "completions/mean_terminated_length": 213.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.5808891348459694, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.1061032097786665, "learning_rate": 8.932940725657478e-06, "loss": 0.0042, "num_tokens": 24703844.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 130.875, "completions/mean_terminated_length": 130.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.5810736026563365, "frac_reward_zero_std": 1.0, "grad_norm": 5.1875, "kl": 0.35607985430397093, "learning_rate": 8.926537388315112e-06, "loss": 0.0142, "num_tokens": 24710219.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.5812580704667035, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0521982591599226, "learning_rate": 8.920134496221858e-06, "loss": 0.0021, "num_tokens": 24718325.0, "reward": 1.75, "reward_std": 0.3162277936935425, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.3162277638912201, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 438.125, "completions/mean_terminated_length": 438.125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.5814425382770706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0498046875, "kl": 0.0387335994746536, "learning_rate": 8.913732052033493e-06, "loss": 0.0015, "num_tokens": 24727302.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.5816270060874378, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.0788916191086173, "learning_rate": 8.907330058405621e-06, "loss": 0.0032, "num_tokens": 24731925.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 356.0, "completions/mean_terminated_length": 356.0, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5818114738978049, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.04759481339715421, "learning_rate": 8.900928517993644e-06, "loss": 0.0019, "num_tokens": 24745077.0, "reward": 1.4693877696990967, "reward_std": 0.464866042137146, "rewards/fixed_code_pass_all_test_reward/mean": 0.4693877398967743, "rewards/fixed_code_pass_all_test_reward/std": 0.464866042137146, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 238.875, "completions/mean_terminated_length": 238.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5819959417081719, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.04585387231782079, "learning_rate": 8.894527433452783e-06, "loss": 0.0018, "num_tokens": 24752548.0, "reward": 1.7083333730697632, "reward_std": 0.4520675241947174, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.4520675837993622, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 324.0, "completions/mean_terminated_length": 324.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.582180409518539, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.04842725582420826, "learning_rate": 8.888126807438074e-06, "loss": 0.0019, "num_tokens": 24758324.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 251.0, "completions/mean_terminated_length": 251.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.5823648773289061, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.058752588694915175, "learning_rate": 8.881726642604352e-06, "loss": 0.0024, "num_tokens": 24763508.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 418.0, "completions/mean_terminated_length": 418.0, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.5825493451392731, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.03883625450544059, "learning_rate": 8.875326941606274e-06, "loss": 0.0016, "num_tokens": 24772364.0, "reward": 1.5833333730697632, "reward_std": 0.2357023060321808, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 525.375, "completions/mean_terminated_length": 525.375, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.5827338129496403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0291748046875, "kl": 0.021793163032270968, "learning_rate": 8.868927707098291e-06, "loss": 0.0009, "num_tokens": 24785135.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 289.75, "completions/mean_terminated_length": 289.75, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.5829182807600074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.023796185036189854, "learning_rate": 8.862528941734675e-06, "loss": 0.001, "num_tokens": 24790997.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 259.375, "completions/mean_terminated_length": 259.375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.5831027485703745, "frac_reward_zero_std": 0.0, "grad_norm": 4.46875, "kl": 0.17211366863921285, "learning_rate": 8.856130648169488e-06, "loss": 0.0069, "num_tokens": 24796656.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5832872163807415, "frac_reward_zero_std": 0.0, "grad_norm": 3.6875, "kl": 0.11956409830600023, "learning_rate": 8.849732829056611e-06, "loss": 0.0048, "num_tokens": 24800917.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.5834716841911086, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.042751993518322706, "learning_rate": 8.843335487049713e-06, "loss": 0.0017, "num_tokens": 24809461.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 172.375, "completions/mean_terminated_length": 172.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.5836561520014757, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.05720906285569072, "learning_rate": 8.836938624802282e-06, "loss": 0.0023, "num_tokens": 24815608.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5838406198118429, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.01910726143978536, "learning_rate": 8.830542244967593e-06, "loss": 0.0008, "num_tokens": 24823605.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.58402508762221, "frac_reward_zero_std": 1.0, "grad_norm": 0.068359375, "kl": 0.04295167955569923, "learning_rate": 8.82414635019873e-06, "loss": 0.0017, "num_tokens": 24827856.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 839.5, "completions/mean_terminated_length": 839.5, "completions/min_length": 655.0, "completions/min_terminated_length": 655.0, "epoch": 0.584209555432577, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.05973434797488153, "learning_rate": 8.817750943148565e-06, "loss": 0.0024, "num_tokens": 24845116.0, "reward": 1.5, "reward_std": 0.4364357888698578, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.41547447443008423, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 329.125, "completions/mean_terminated_length": 329.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.5843940232429441, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.084270556923002, "learning_rate": 8.811356026469784e-06, "loss": 0.0034, "num_tokens": 24855509.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 163.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.5845784910533112, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.0710697122849524, "learning_rate": 8.804961602814854e-06, "loss": 0.0028, "num_tokens": 24859714.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5847629588636782, "frac_reward_zero_std": 1.0, "grad_norm": 0.376953125, "kl": 0.10274150292389095, "learning_rate": 8.798567674836047e-06, "loss": 0.0041, "num_tokens": 24868209.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5849474266740454, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.026294138515368104, "learning_rate": 8.792174245185422e-06, "loss": 0.0011, "num_tokens": 24876964.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.5851318944844125, "frac_reward_zero_std": 1.0, "grad_norm": 0.306640625, "kl": 0.05886426707729697, "learning_rate": 8.785781316514841e-06, "loss": 0.0024, "num_tokens": 24880833.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5853163622947796, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.11528771463781595, "learning_rate": 8.77938889147595e-06, "loss": 0.0046, "num_tokens": 24888786.0, "reward": 1.0399999618530273, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.03999999910593033, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 308.25, "completions/mean_terminated_length": 308.25, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.5855008301051466, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.03145635686814785, "learning_rate": 8.77299697272019e-06, "loss": 0.0013, "num_tokens": 24900492.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 688.125, "completions/mean_terminated_length": 493.857177734375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.5856852979155137, "frac_reward_zero_std": 0.0, "grad_norm": 0.7890625, "kl": 0.03960421320516616, "learning_rate": 8.76660556289879e-06, "loss": 0.0016, "num_tokens": 24910861.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 192.625, "completions/mean_terminated_length": 192.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.5858697657258808, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.05435580341145396, "learning_rate": 8.760214664662765e-06, "loss": 0.0022, "num_tokens": 24917522.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 770.25, "completions/mean_terminated_length": 770.25, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "epoch": 0.586054233536248, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.028289878275245428, "learning_rate": 8.753824280662929e-06, "loss": 0.0011, "num_tokens": 24933244.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.586238701346615, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.04190579941496253, "learning_rate": 8.74743441354987e-06, "loss": 0.0017, "num_tokens": 24941282.0, "reward": 1.587837815284729, "reward_std": 0.09555497020483017, "rewards/fixed_code_pass_all_test_reward/mean": 0.587837815284729, "rewards/fixed_code_pass_all_test_reward/std": 0.09555497020483017, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5864231691569821, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.061950956704095006, "learning_rate": 8.74104506597397e-06, "loss": 0.0025, "num_tokens": 24950586.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.5866076369673492, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.04604525334434584, "learning_rate": 8.73465624058539e-06, "loss": 0.0018, "num_tokens": 24960987.0, "reward": 1.8685345649719238, "reward_std": 0.2707245647907257, "rewards/fixed_code_pass_all_test_reward/mean": 0.8685344457626343, "rewards/fixed_code_pass_all_test_reward/std": 0.2707245647907257, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 248.0, "completions/mean_terminated_length": 248.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.5867921047777163, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.04262489650864154, "learning_rate": 8.728267940034079e-06, "loss": 0.0017, "num_tokens": 24966115.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 409.875, "completions/mean_terminated_length": 409.875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.5869765725880833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0546875, "kl": 0.033372290316037834, "learning_rate": 8.721880166969761e-06, "loss": 0.0013, "num_tokens": 24977250.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 215.0, "completions/mean_terminated_length": 215.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5871610403984505, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.05150403222069144, "learning_rate": 8.715492924041955e-06, "loss": 0.0021, "num_tokens": 24985794.0, "reward": 1.6911765336990356, "reward_std": 0.4262169301509857, "rewards/fixed_code_pass_all_test_reward/mean": 0.6911765336990356, "rewards/fixed_code_pass_all_test_reward/std": 0.4262169897556305, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 280.5, "completions/mean_terminated_length": 280.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.5873455082088176, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.03991248516831547, "learning_rate": 8.709106213899943e-06, "loss": 0.0016, "num_tokens": 24994142.0, "reward": 1.2604167461395264, "reward_std": 0.3194659948348999, "rewards/fixed_code_pass_all_test_reward/mean": 0.3854166269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.3240906000137329, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 469.625, "completions/mean_terminated_length": 469.625, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.5875299760191847, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.036333235213533044, "learning_rate": 8.702720039192801e-06, "loss": 0.0015, "num_tokens": 25002643.0, "reward": 1.9500000476837158, "reward_std": 0.09258202463388443, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 257.125, "completions/mean_terminated_length": 257.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.5877144438295517, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.04805705789476633, "learning_rate": 8.696334402569372e-06, "loss": 0.0019, "num_tokens": 25010828.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 250.625, "completions/mean_terminated_length": 250.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5878989116399188, "frac_reward_zero_std": 0.0, "grad_norm": 4.90625, "kl": 0.12153078685514629, "learning_rate": 8.68994930667828e-06, "loss": 0.0049, "num_tokens": 25017329.0, "reward": 1.8086735010147095, "reward_std": 0.2785305380821228, "rewards/fixed_code_pass_all_test_reward/mean": 0.8086735010147095, "rewards/fixed_code_pass_all_test_reward/std": 0.2785305678844452, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 290.75, "completions/mean_terminated_length": 290.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5880833794502859, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.04142124834470451, "learning_rate": 8.683564754167932e-06, "loss": 0.0017, "num_tokens": 25024367.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5882678472606531, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.065675527905114, "learning_rate": 8.677180747686492e-06, "loss": 0.0026, "num_tokens": 25031430.0, "reward": 1.9583333730697632, "reward_std": 0.04454349726438522, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.04454353079199791, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.5884523150710201, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.14802498184144497, "learning_rate": 8.670797289881915e-06, "loss": 0.0059, "num_tokens": 25040227.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 246.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.5886367828813872, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.02887022460345179, "learning_rate": 8.664414383401918e-06, "loss": 0.0012, "num_tokens": 25045979.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 229.0, "completions/mean_terminated_length": 229.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5888212506917543, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.06541248084977269, "learning_rate": 8.658032030893995e-06, "loss": 0.0026, "num_tokens": 25051867.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.5890057185021214, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.047532339580357075, "learning_rate": 8.651650235005407e-06, "loss": 0.0019, "num_tokens": 25063314.0, "reward": 1.3203781843185425, "reward_std": 0.2652982771396637, "rewards/fixed_code_pass_all_test_reward/mean": 0.3203781843185425, "rewards/fixed_code_pass_all_test_reward/std": 0.26529833674430847, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5891901863124884, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.07631267793476582, "learning_rate": 8.645268998383187e-06, "loss": 0.0031, "num_tokens": 25070365.0, "reward": 1.9954545497894287, "reward_std": 0.012856475077569485, "rewards/fixed_code_pass_all_test_reward/mean": 0.9954545497894287, "rewards/fixed_code_pass_all_test_reward/std": 0.012856495566666126, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.5893746541228556, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.08464737003669143, "learning_rate": 8.638888323674134e-06, "loss": 0.0034, "num_tokens": 25078607.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 171.875, "completions/mean_terminated_length": 171.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5895591219332227, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.05199655727483332, "learning_rate": 8.632508213524808e-06, "loss": 0.0021, "num_tokens": 25085750.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 309.0, "completions/mean_terminated_length": 309.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5897435897435898, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.10856376774609089, "learning_rate": 8.62612867058155e-06, "loss": 0.0043, "num_tokens": 25096294.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5899280575539568, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.052681738045066595, "learning_rate": 8.61974969749045e-06, "loss": 0.0021, "num_tokens": 25102752.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 201.75, "completions/mean_terminated_length": 201.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5901125253643239, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.12214001500979066, "learning_rate": 8.613371296897371e-06, "loss": 0.0049, "num_tokens": 25110086.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.590296993174691, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.11049781227484345, "learning_rate": 8.606993471447934e-06, "loss": 0.0044, "num_tokens": 25117926.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 478.125, "completions/mean_terminated_length": 478.125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.5904814609850582, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.06073322496376932, "learning_rate": 8.600616223787527e-06, "loss": 0.0024, "num_tokens": 25127415.0, "reward": 1.8875000476837158, "reward_std": 0.2130187302827835, "rewards/fixed_code_pass_all_test_reward/mean": 0.887499988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.2130187600851059, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 508.875, "completions/mean_terminated_length": 508.875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.5906659287954252, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.038828458869829774, "learning_rate": 8.59423955656129e-06, "loss": 0.0016, "num_tokens": 25141990.0, "reward": 0.8958333134651184, "reward_std": 0.3649037480354309, "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, "rewards/fixed_code_pass_all_test_reward/std": 0.046929534524679184, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 243.875, "completions/mean_terminated_length": 243.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.5908503966057923, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.06387629639357328, "learning_rate": 8.587863472414127e-06, "loss": 0.0026, "num_tokens": 25150509.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 225.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.5910348644161594, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.0498681862372905, "learning_rate": 8.581487973990706e-06, "loss": 0.002, "num_tokens": 25156458.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5912193322265265, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.0712782102636993, "learning_rate": 8.57511306393544e-06, "loss": 0.0029, "num_tokens": 25165846.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 203.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.5914038000368935, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.06684893975034356, "learning_rate": 8.56873874489251e-06, "loss": 0.0027, "num_tokens": 25175909.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 260.75, "completions/mean_terminated_length": 260.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5915882678472607, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.04217724839691073, "learning_rate": 8.562365019505841e-06, "loss": 0.0017, "num_tokens": 25181507.0, "reward": 1.899999976158142, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 289.75, "completions/mean_terminated_length": 289.75, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.5917727356576278, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.07314746221527457, "learning_rate": 8.555991890419116e-06, "loss": 0.0029, "num_tokens": 25189705.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5919572034679949, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.04876112728379667, "learning_rate": 8.549619360275776e-06, "loss": 0.002, "num_tokens": 25195377.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.5921416712783619, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.08647234225645661, "learning_rate": 8.543247431719003e-06, "loss": 0.0035, "num_tokens": 25204573.0, "reward": 1.7803030014038086, "reward_std": 0.40160703659057617, "rewards/fixed_code_pass_all_test_reward/mean": 0.7803030014038086, "rewards/fixed_code_pass_all_test_reward/std": 0.40160703659057617, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 371.0, "completions/mean_terminated_length": 371.0, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.592326139088729, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.03081541054416448, "learning_rate": 8.53687610739174e-06, "loss": 0.0012, "num_tokens": 25211181.0, "reward": 1.600000023841858, "reward_std": 0.6761233806610107, "rewards/fixed_code_pass_all_test_reward/mean": 0.7250000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5925106068990961, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.051936286967247725, "learning_rate": 8.530505389936672e-06, "loss": 0.0021, "num_tokens": 25216284.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 233.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.5926950747094631, "frac_reward_zero_std": 1.0, "grad_norm": 0.1123046875, "kl": 0.052119452971965075, "learning_rate": 8.52413528199624e-06, "loss": 0.0021, "num_tokens": 25221071.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 279.0, "completions/mean_terminated_length": 279.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.5928795425198303, "frac_reward_zero_std": 1.0, "grad_norm": 1.6640625, "kl": 0.10047052125446498, "learning_rate": 8.517765786212616e-06, "loss": 0.004, "num_tokens": 25227759.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 229.0, "completions/mean_terminated_length": 229.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.5930640103301974, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.03733651037327945, "learning_rate": 8.511396905227745e-06, "loss": 0.0015, "num_tokens": 25232615.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.5932484781405645, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.07049525016918778, "learning_rate": 8.505028641683288e-06, "loss": 0.0028, "num_tokens": 25240529.0, "reward": 1.8166667222976685, "reward_std": 0.11315575242042542, "rewards/fixed_code_pass_all_test_reward/mean": 0.8166667222976685, "rewards/fixed_code_pass_all_test_reward/std": 0.1131557822227478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.5934329459509315, "frac_reward_zero_std": 0.0, "grad_norm": 3.078125, "kl": 0.10388934193179011, "learning_rate": 8.498660998220669e-06, "loss": 0.0042, "num_tokens": 25244857.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 290.5, "completions/mean_terminated_length": 290.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.5936174137612986, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.08806224958971143, "learning_rate": 8.49229397748105e-06, "loss": 0.0035, "num_tokens": 25257533.0, "reward": 1.0775861740112305, "reward_std": 0.015962397679686546, "rewards/fixed_code_pass_all_test_reward/mean": 0.07758620381355286, "rewards/fixed_code_pass_all_test_reward/std": 0.01596241630613804, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 326.625, "completions/mean_terminated_length": 326.625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5938018815716657, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.06908301124349236, "learning_rate": 8.48592758210533e-06, "loss": 0.0028, "num_tokens": 25269362.0, "reward": 1.8169642686843872, "reward_std": 0.021044835448265076, "rewards/fixed_code_pass_all_test_reward/mean": 0.8169642686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.02104484848678112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.5939863493820329, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.04887157795019448, "learning_rate": 8.479561814734155e-06, "loss": 0.002, "num_tokens": 25276706.0, "reward": 1.7083333730697632, "reward_std": 0.7000566720962524, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.3563483655452728, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5941708171924, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.06683684606105089, "learning_rate": 8.473196678007907e-06, "loss": 0.0027, "num_tokens": 25281863.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.594355285002767, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.05977827054448426, "learning_rate": 8.466832174566708e-06, "loss": 0.0024, "num_tokens": 25288065.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 317.0, "completions/mean_terminated_length": 317.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5945397528131341, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.061566812451928854, "learning_rate": 8.460468307050415e-06, "loss": 0.0025, "num_tokens": 25299625.0, "reward": 1.1774194240570068, "reward_std": 0.24506427347660065, "rewards/fixed_code_pass_all_test_reward/mean": 0.17741934955120087, "rewards/fixed_code_pass_all_test_reward/std": 0.24506424367427826, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.5947242206235012, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.09404979133978486, "learning_rate": 8.454105078098624e-06, "loss": 0.0038, "num_tokens": 25306532.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5949086884338682, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.05805628327652812, "learning_rate": 8.447742490350665e-06, "loss": 0.0023, "num_tokens": 25316839.0, "reward": 1.8250000476837158, "reward_std": 0.11338935047388077, "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.11338934302330017, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.5950931562442354, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.076607555616647, "learning_rate": 8.441380546445603e-06, "loss": 0.0031, "num_tokens": 25325241.0, "reward": 1.7447916269302368, "reward_std": 0.7051530480384827, "rewards/fixed_code_pass_all_test_reward/mean": 0.8697916269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.3517512381076813, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 325.25, "completions/mean_terminated_length": 325.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.5952776240546025, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.10158417746424675, "learning_rate": 8.435019249022233e-06, "loss": 0.0041, "num_tokens": 25336907.0, "reward": 1.1504237651824951, "reward_std": 0.8194618821144104, "rewards/fixed_code_pass_all_test_reward/mean": 0.40042370557785034, "rewards/fixed_code_pass_all_test_reward/std": 0.47793152928352356, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5954620918649696, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.07089452166110277, "learning_rate": 8.428658600719084e-06, "loss": 0.0028, "num_tokens": 25345970.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 383.75, "completions/mean_terminated_length": 383.75, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.5956465596753366, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.02850088015838992, "learning_rate": 8.422298604174417e-06, "loss": 0.0011, "num_tokens": 25357672.0, "reward": 1.2386363744735718, "reward_std": 0.09642363339662552, "rewards/fixed_code_pass_all_test_reward/mean": 0.23863637447357178, "rewards/fixed_code_pass_all_test_reward/std": 0.09642365574836731, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 327.875, "completions/mean_terminated_length": 327.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.5958310274857037, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.07595439814031124, "learning_rate": 8.415939262026219e-06, "loss": 0.003, "num_tokens": 25364591.0, "reward": 1.2562499046325684, "reward_std": 0.14985109865665436, "rewards/fixed_code_pass_all_test_reward/mean": 0.2562500238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.14985112845897675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 318.25, "completions/mean_terminated_length": 318.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.5960154952960708, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.06655702390708029, "learning_rate": 8.40958057691221e-06, "loss": 0.0027, "num_tokens": 25373545.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.596199963106438, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.07881126319989562, "learning_rate": 8.403222551469833e-06, "loss": 0.0032, "num_tokens": 25384226.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.596384430916805, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.04022028320468962, "learning_rate": 8.396865188336263e-06, "loss": 0.0016, "num_tokens": 25388413.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 311.25, "completions/mean_terminated_length": 311.25, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.5965688987271721, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.06723395083099604, "learning_rate": 8.390508490148392e-06, "loss": 0.0027, "num_tokens": 25394607.0, "reward": 1.828125, "reward_std": 0.35942351818084717, "rewards/fixed_code_pass_all_test_reward/mean": 0.953125, "rewards/fixed_code_pass_all_test_reward/std": 0.13258251547813416, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 256.375, "completions/mean_terminated_length": 256.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.5967533665375392, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.05367007455788553, "learning_rate": 8.384152459542849e-06, "loss": 0.0021, "num_tokens": 25403250.0, "reward": 1.1363636255264282, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.13636364042758942, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 535.5, "completions/mean_terminated_length": 535.5, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.5969378343479063, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.0613776717800647, "learning_rate": 8.377797099155969e-06, "loss": 0.0025, "num_tokens": 25419286.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 396.5, "completions/mean_terminated_length": 396.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.5971223021582733, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.05931693676393479, "learning_rate": 8.371442411623825e-06, "loss": 0.0024, "num_tokens": 25434170.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 319.25, "completions/mean_terminated_length": 319.25, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.5973067699686405, "frac_reward_zero_std": 1.0, "grad_norm": 0.056396484375, "kl": 0.016838344163261354, "learning_rate": 8.365088399582204e-06, "loss": 0.0007, "num_tokens": 25442692.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 223.75, "completions/mean_terminated_length": 223.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5974912377790076, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.05937175382860005, "learning_rate": 8.358735065666606e-06, "loss": 0.0024, "num_tokens": 25448162.0, "reward": 1.3250000476837158, "reward_std": 0.2882624566555023, "rewards/fixed_code_pass_all_test_reward/mean": 0.32499998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.2882624566555023, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 303.375, "completions/mean_terminated_length": 303.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.5976757055893747, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.06637639715336263, "learning_rate": 8.352382412512264e-06, "loss": 0.0027, "num_tokens": 25453269.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 229.25, "completions/mean_terminated_length": 229.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5978601733997417, "frac_reward_zero_std": 1.0, "grad_norm": 0.2041015625, "kl": 0.04743851535022259, "learning_rate": 8.346030442754118e-06, "loss": 0.0019, "num_tokens": 25458079.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5980446412101088, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.09651560196653008, "learning_rate": 8.33967915902683e-06, "loss": 0.0039, "num_tokens": 25462619.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 445.875, "completions/mean_terminated_length": 445.875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.5982291090204759, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.02713770850095898, "learning_rate": 8.33332856396477e-06, "loss": 0.0011, "num_tokens": 25473386.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.5984135768308431, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.047607710817828774, "learning_rate": 8.326978660202034e-06, "loss": 0.0019, "num_tokens": 25477448.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.5985980446412101, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.06106300977990031, "learning_rate": 8.320629450372425e-06, "loss": 0.0024, "num_tokens": 25482118.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 235.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.5987825124515772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.05153792607598007, "learning_rate": 8.314280937109453e-06, "loss": 0.0021, "num_tokens": 25489872.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 111.625, "completions/mean_terminated_length": 111.625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.5989669802619443, "frac_reward_zero_std": 0.0, "grad_norm": 3.5625, "kl": 0.05958120711147785, "learning_rate": 8.30793312304635e-06, "loss": 0.0024, "num_tokens": 25493485.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 410.5, "completions/mean_terminated_length": 410.5, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.5991514480723114, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.048616938991472125, "learning_rate": 8.301586010816047e-06, "loss": 0.0019, "num_tokens": 25502145.0, "reward": 1.9732142686843872, "reward_std": 0.07576146721839905, "rewards/fixed_code_pass_all_test_reward/mean": 0.9732142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.07576145231723785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 264.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.5993359158826784, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.10278662340715528, "learning_rate": 8.295239603051193e-06, "loss": 0.0041, "num_tokens": 25508487.0, "reward": 0.8563829660415649, "reward_std": 0.7574459910392761, "rewards/fixed_code_pass_all_test_reward/mean": 0.23138296604156494, "rewards/fixed_code_pass_all_test_reward/std": 0.32793134450912476, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 3249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 282.625, "completions/mean_terminated_length": 282.625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5995203836930456, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.05988399335183203, "learning_rate": 8.28889390238414e-06, "loss": 0.0024, "num_tokens": 25513604.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5997048515034127, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.09363026265054941, "learning_rate": 8.282548911446945e-06, "loss": 0.0037, "num_tokens": 25520104.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.5998893193137798, "frac_reward_zero_std": 1.0, "grad_norm": 0.43359375, "kl": 0.050114861922338605, "learning_rate": 8.276204632871378e-06, "loss": 0.002, "num_tokens": 25524909.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 253.75, "completions/mean_terminated_length": 253.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.6000737871241468, "frac_reward_zero_std": 0.0, "grad_norm": 14.875, "kl": 0.11650070990435779, "learning_rate": 8.269861069288903e-06, "loss": 0.0047, "num_tokens": 25531363.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 243.75, "completions/mean_terminated_length": 243.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6002582549345139, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.052279898431152105, "learning_rate": 8.263518223330698e-06, "loss": 0.0021, "num_tokens": 25537537.0, "reward": 1.7154254913330078, "reward_std": 0.6998616456985474, "rewards/fixed_code_pass_all_test_reward/mean": 0.8404254913330078, "rewards/fixed_code_pass_all_test_reward/std": 0.3531072437763214, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 514.125, "completions/mean_terminated_length": 514.125, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.600442722744881, "frac_reward_zero_std": 1.0, "grad_norm": 0.046630859375, "kl": 0.023886391893029213, "learning_rate": 8.257176097627634e-06, "loss": 0.001, "num_tokens": 25553026.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 318.0, "completions/mean_terminated_length": 318.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6006271905552482, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.10899634333327413, "learning_rate": 8.250834694810293e-06, "loss": 0.0044, "num_tokens": 25563154.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 278.0, "completions/mean_terminated_length": 278.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6008116583656152, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.03053273633122444, "learning_rate": 8.244494017508948e-06, "loss": 0.0012, "num_tokens": 25568498.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.6009961261759823, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.10166245605796576, "learning_rate": 8.238154068353578e-06, "loss": 0.0041, "num_tokens": 25576498.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.6011805939863494, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.08216354437172413, "learning_rate": 8.231814849973855e-06, "loss": 0.0033, "num_tokens": 25585218.0, "reward": 1.7923729419708252, "reward_std": 0.0838940218091011, "rewards/fixed_code_pass_all_test_reward/mean": 0.7923728823661804, "rewards/fixed_code_pass_all_test_reward/std": 0.0838940218091011, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 142.0, "completions/mean_terminated_length": 142.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.6013650617967164, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.035118608735501766, "learning_rate": 8.225476364999147e-06, "loss": 0.0014, "num_tokens": 25589234.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 212.0, "completions/mean_terminated_length": 212.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.6015495296070835, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.05302830832079053, "learning_rate": 8.21913861605853e-06, "loss": 0.0021, "num_tokens": 25595106.0, "reward": 1.774999976158142, "reward_std": 0.4200340211391449, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.4200340509414673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.6017339974174507, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.06788261700421572, "learning_rate": 8.212801605780754e-06, "loss": 0.0027, "num_tokens": 25604066.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 227.375, "completions/mean_terminated_length": 227.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.6019184652278178, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.07884980225935578, "learning_rate": 8.206465336794282e-06, "loss": 0.0032, "num_tokens": 25609717.0, "reward": 1.7746710777282715, "reward_std": 0.22870713472366333, "rewards/fixed_code_pass_all_test_reward/mean": 0.7746710181236267, "rewards/fixed_code_pass_all_test_reward/std": 0.22870714962482452, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.6021029330381849, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.11600998044013977, "learning_rate": 8.200129811727256e-06, "loss": 0.0046, "num_tokens": 25618498.0, "reward": 0.9950000047683716, "reward_std": 0.40928512811660767, "rewards/fixed_code_pass_all_test_reward/mean": 0.24500000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.31455183029174805, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.6022874008485519, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.043330899672582746, "learning_rate": 8.193795033207523e-06, "loss": 0.0017, "num_tokens": 25627398.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.602471868658919, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.08904390665702522, "learning_rate": 8.187461003862603e-06, "loss": 0.0036, "num_tokens": 25638077.0, "reward": 1.5555555820465088, "reward_std": 0.705233633518219, "rewards/fixed_code_pass_all_test_reward/mean": 0.680555522441864, "rewards/fixed_code_pass_all_test_reward/std": 0.4217938482761383, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 346.375, "completions/mean_terminated_length": 346.375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.6026563364692861, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.030930305714718997, "learning_rate": 8.181127726319721e-06, "loss": 0.0012, "num_tokens": 25645496.0, "reward": 1.8229166269302368, "reward_std": 0.2651650607585907, "rewards/fixed_code_pass_all_test_reward/mean": 0.8229166269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 641.5, "completions/mean_terminated_length": 641.5, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.6028408042796533, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.025008522905409336, "learning_rate": 8.17479520320578e-06, "loss": 0.001, "num_tokens": 25657212.0, "reward": 0.9375, "reward_std": 0.4172614812850952, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 181.125, "completions/mean_terminated_length": 181.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6030252720900203, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "kl": 0.07691958639770746, "learning_rate": 8.168463437147379e-06, "loss": 0.0031, "num_tokens": 25665701.0, "reward": 1.9407894611358643, "reward_std": 0.16747267544269562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9407894611358643, "rewards/fixed_code_pass_all_test_reward/std": 0.16747266054153442, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 471.125, "completions/mean_terminated_length": 471.125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.6032097399003874, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.04306876705959439, "learning_rate": 8.162132430770791e-06, "loss": 0.0017, "num_tokens": 25678078.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 222.125, "completions/mean_terminated_length": 222.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6033942077107545, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.06264477106742561, "learning_rate": 8.155802186701984e-06, "loss": 0.0025, "num_tokens": 25687743.0, "reward": 1.9021739959716797, "reward_std": 0.1826234608888626, "rewards/fixed_code_pass_all_test_reward/mean": 0.9021739363670349, "rewards/fixed_code_pass_all_test_reward/std": 0.182623490691185, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6035786755211215, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.09287602780386806, "learning_rate": 8.14947270756661e-06, "loss": 0.0037, "num_tokens": 25697479.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 510.75, "completions/mean_terminated_length": 510.75, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.6037631433314886, "frac_reward_zero_std": 1.0, "grad_norm": 0.03564453125, "kl": 0.018309422594029456, "learning_rate": 8.143143995989992e-06, "loss": 0.0007, "num_tokens": 25707477.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.6039476111418558, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.050003305077552795, "learning_rate": 8.136816054597152e-06, "loss": 0.002, "num_tokens": 25714306.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6041320789522229, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.055089302361011505, "learning_rate": 8.130488886012777e-06, "loss": 0.0022, "num_tokens": 25720826.0, "reward": 1.220588207244873, "reward_std": 0.537979245185852, "rewards/fixed_code_pass_all_test_reward/mean": 0.3455882668495178, "rewards/fixed_code_pass_all_test_reward/std": 0.25628530979156494, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.60431654676259, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.07284242240712047, "learning_rate": 8.124162492861244e-06, "loss": 0.0029, "num_tokens": 25731091.0, "reward": 1.0250000953674316, "reward_std": 0.03874867036938667, "rewards/fixed_code_pass_all_test_reward/mean": 0.02500000037252903, "rewards/fixed_code_pass_all_test_reward/std": 0.03874865174293518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.604501014572957, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.07081936905160546, "learning_rate": 8.1178368777666e-06, "loss": 0.0028, "num_tokens": 25736972.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 288.125, "completions/mean_terminated_length": 288.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.6046854823833241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.037889977684244514, "learning_rate": 8.111512043352577e-06, "loss": 0.0015, "num_tokens": 25746117.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.6048699501936912, "frac_reward_zero_std": 1.0, "grad_norm": 0.216796875, "kl": 0.04699895950034261, "learning_rate": 8.105187992242578e-06, "loss": 0.0019, "num_tokens": 25755007.0, "reward": 1.7471264600753784, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7471264600753784, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 275.875, "completions/mean_terminated_length": 275.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6050544180040582, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.11316480627283454, "learning_rate": 8.098864727059685e-06, "loss": 0.0045, "num_tokens": 25765750.0, "reward": 1.1193182468414307, "reward_std": 0.14766483008861542, "rewards/fixed_code_pass_all_test_reward/mean": 0.11931818723678589, "rewards/fixed_code_pass_all_test_reward/std": 0.14766483008861542, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.6052388858144254, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.07001058617606759, "learning_rate": 8.09254225042665e-06, "loss": 0.0028, "num_tokens": 25773224.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 280.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.6054233536247925, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.05991666717454791, "learning_rate": 8.086220564965899e-06, "loss": 0.0024, "num_tokens": 25781801.0, "reward": 1.9812500476837158, "reward_std": 0.03720121830701828, "rewards/fixed_code_pass_all_test_reward/mean": 0.981249988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.03720119968056679, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 366.625, "completions/mean_terminated_length": 366.625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.6056078214351596, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.0708762863650918, "learning_rate": 8.07989967329953e-06, "loss": 0.0028, "num_tokens": 25792054.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 443.625, "completions/mean_terminated_length": 443.625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6057922892455266, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.045524381566792727, "learning_rate": 8.073579578049318e-06, "loss": 0.0018, "num_tokens": 25801011.0, "reward": 1.7581169605255127, "reward_std": 0.059690870344638824, "rewards/fixed_code_pass_all_test_reward/mean": 0.7581168413162231, "rewards/fixed_code_pass_all_test_reward/std": 0.05969083309173584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 379.375, "completions/mean_terminated_length": 379.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.6059767570558937, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.05793734919279814, "learning_rate": 8.067260281836695e-06, "loss": 0.0023, "num_tokens": 25808486.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 222.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.6061612248662608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.04057767934864387, "learning_rate": 8.060941787282774e-06, "loss": 0.0016, "num_tokens": 25816226.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.606345692676628, "frac_reward_zero_std": 1.0, "grad_norm": 0.05908203125, "kl": 0.039127042749896646, "learning_rate": 8.054624097008328e-06, "loss": 0.0016, "num_tokens": 25828470.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 364.75, "completions/mean_terminated_length": 364.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.606530160486995, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.05139843688812107, "learning_rate": 8.048307213633798e-06, "loss": 0.0021, "num_tokens": 25838260.0, "reward": 1.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 297.75, "completions/mean_terminated_length": 297.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.6067146282973621, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.036218631314113736, "learning_rate": 8.04199113977929e-06, "loss": 0.0014, "num_tokens": 25844882.0, "reward": 1.6755318641662598, "reward_std": 0.2514910399913788, "rewards/fixed_code_pass_all_test_reward/mean": 0.6755319833755493, "rewards/fixed_code_pass_all_test_reward/std": 0.25149106979370117, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6068990961077292, "frac_reward_zero_std": 1.0, "grad_norm": 0.373046875, "kl": 0.0696930221747607, "learning_rate": 8.035675878064579e-06, "loss": 0.0028, "num_tokens": 25854269.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 267.375, "completions/mean_terminated_length": 267.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6070835639180963, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.04107871011365205, "learning_rate": 8.029361431109095e-06, "loss": 0.0016, "num_tokens": 25862672.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 284.625, "completions/mean_terminated_length": 284.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6072680317284633, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.05878895986825228, "learning_rate": 8.023047801531936e-06, "loss": 0.0024, "num_tokens": 25872197.0, "reward": 1.2579786777496338, "reward_std": 0.1093432754278183, "rewards/fixed_code_pass_all_test_reward/mean": 0.25797873735427856, "rewards/fixed_code_pass_all_test_reward/std": 0.10934330523014069, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6074524995388305, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.06577179906889796, "learning_rate": 8.016734991951859e-06, "loss": 0.0026, "num_tokens": 25880958.0, "reward": 1.4507575035095215, "reward_std": 0.27935078740119934, "rewards/fixed_code_pass_all_test_reward/mean": 0.45075759291648865, "rewards/fixed_code_pass_all_test_reward/std": 0.27935078740119934, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 406.75, "completions/mean_terminated_length": 406.75, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.6076369673491976, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.034752997336909175, "learning_rate": 8.01042300498728e-06, "loss": 0.0014, "num_tokens": 25888324.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 387.875, "completions/mean_terminated_length": 387.875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6078214351595647, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.07395045459270477, "learning_rate": 8.004111843256277e-06, "loss": 0.003, "num_tokens": 25894531.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 200.625, "completions/mean_terminated_length": 200.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6080059029699317, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.10316143836826086, "learning_rate": 7.997801509376581e-06, "loss": 0.0041, "num_tokens": 25902328.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.6081903707802988, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.059050204465165734, "learning_rate": 7.991492005965585e-06, "loss": 0.0024, "num_tokens": 25909568.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 357.5, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.6083748385906659, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.047753133811056614, "learning_rate": 7.985183335640332e-06, "loss": 0.0019, "num_tokens": 25917148.0, "reward": 1.1691176891326904, "reward_std": 0.09133215993642807, "rewards/fixed_code_pass_all_test_reward/mean": 0.16911765933036804, "rewards/fixed_code_pass_all_test_reward/std": 0.09133220463991165, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6085593064010331, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.0384992768522352, "learning_rate": 7.978875501017526e-06, "loss": 0.0015, "num_tokens": 25923733.0, "reward": 1.7826087474822998, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.782608687877655, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 376.0, "completions/mean_terminated_length": 376.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6087437742114001, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.03806976368650794, "learning_rate": 7.972568504713516e-06, "loss": 0.0015, "num_tokens": 25929637.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.6089282420217672, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.07426609890535474, "learning_rate": 7.966262349344312e-06, "loss": 0.003, "num_tokens": 25937429.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 302.625, "completions/mean_terminated_length": 302.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.6091127098321343, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.07925684959627688, "learning_rate": 7.95995703752557e-06, "loss": 0.0032, "num_tokens": 25947154.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 279.125, "completions/mean_terminated_length": 279.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.6092971776425014, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.04115933575667441, "learning_rate": 7.953652571872594e-06, "loss": 0.0016, "num_tokens": 25952739.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 379.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.6094816454528684, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.06244470411911607, "learning_rate": 7.947348955000345e-06, "loss": 0.0025, "num_tokens": 25964732.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 463.5, "completions/mean_terminated_length": 463.5, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.6096661132632356, "frac_reward_zero_std": 1.0, "grad_norm": 0.04443359375, "kl": 0.034013689728453755, "learning_rate": 7.941046189523422e-06, "loss": 0.0014, "num_tokens": 25974280.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 254.5, "completions/mean_terminated_length": 254.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.6098505810736027, "frac_reward_zero_std": 1.0, "grad_norm": 0.55078125, "kl": 0.06648468459025025, "learning_rate": 7.93474427805608e-06, "loss": 0.0027, "num_tokens": 25979572.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 145.875, "completions/mean_terminated_length": 145.875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.6100350488839698, "frac_reward_zero_std": 0.0, "grad_norm": 3.765625, "kl": 0.05690839048475027, "learning_rate": 7.928443223212216e-06, "loss": 0.0023, "num_tokens": 25987507.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 316.75, "completions/mean_terminated_length": 316.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.6102195166943368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.025407888053450733, "learning_rate": 7.922143027605369e-06, "loss": 0.001, "num_tokens": 25993481.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 276.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.6104039845047039, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.05157643649727106, "learning_rate": 7.915843693848725e-06, "loss": 0.0021, "num_tokens": 26002945.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 230.75, "completions/mean_terminated_length": 230.75, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.610588452315071, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.08085212437435985, "learning_rate": 7.909545224555113e-06, "loss": 0.0032, "num_tokens": 26010007.0, "reward": 1.1975308656692505, "reward_std": 0.5794020295143127, "rewards/fixed_code_pass_all_test_reward/mean": 0.3225308656692505, "rewards/fixed_code_pass_all_test_reward/std": 0.3443181812763214, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 204.625, "completions/mean_terminated_length": 204.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6107729201254382, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.08979694545269012, "learning_rate": 7.903247622336998e-06, "loss": 0.0036, "num_tokens": 26017076.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 251.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.6109573879358052, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.08600339852273464, "learning_rate": 7.896950889806496e-06, "loss": 0.0034, "num_tokens": 26026797.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.6111418557461723, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.04582513612695038, "learning_rate": 7.890655029575352e-06, "loss": 0.0018, "num_tokens": 26032132.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.6113263235565394, "frac_reward_zero_std": 0.0, "grad_norm": 3.84375, "kl": 0.06826796545647085, "learning_rate": 7.88436004425495e-06, "loss": 0.0027, "num_tokens": 26040086.0, "reward": 1.2836538553237915, "reward_std": 0.5527681708335876, "rewards/fixed_code_pass_all_test_reward/mean": 0.4086538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.25257551670074463, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 446.0, "completions/mean_terminated_length": 446.0, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.6115107913669064, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.036256005987524986, "learning_rate": 7.878065936456319e-06, "loss": 0.0015, "num_tokens": 26049526.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 218.5, "completions/mean_terminated_length": 218.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6116952591772735, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.042537715984508395, "learning_rate": 7.871772708790114e-06, "loss": 0.0017, "num_tokens": 26058386.0, "reward": 1.8250000476837158, "reward_std": 0.3240370750427246, "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.32403701543807983, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 325.875, "completions/mean_terminated_length": 325.875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.6118797269876407, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.029152492294088006, "learning_rate": 7.865480363866636e-06, "loss": 0.0012, "num_tokens": 26064433.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 266.75, "completions/mean_terminated_length": 266.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.6120641947980078, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.07142718182876706, "learning_rate": 7.859188904295806e-06, "loss": 0.0029, "num_tokens": 26073039.0, "reward": 0.9027777910232544, "reward_std": 0.3682146370410919, "rewards/fixed_code_pass_all_test_reward/mean": 0.02777777798473835, "rewards/fixed_code_pass_all_test_reward/std": 0.05143444985151291, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.6122486626083748, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.06767069455236197, "learning_rate": 7.852898332687196e-06, "loss": 0.0027, "num_tokens": 26077027.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 213.75, "completions/mean_terminated_length": 213.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.6124331304187419, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.08393948315642774, "learning_rate": 7.846608651649989e-06, "loss": 0.0034, "num_tokens": 26081697.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 421.75, "completions/mean_terminated_length": 421.75, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.612617598229109, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.03190221078693867, "learning_rate": 7.840319863793016e-06, "loss": 0.0013, "num_tokens": 26094119.0, "reward": 1.9440104961395264, "reward_std": 0.11176696419715881, "rewards/fixed_code_pass_all_test_reward/mean": 0.9440104365348816, "rewards/fixed_code_pass_all_test_reward/std": 0.1117669939994812, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6128020660394761, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.044703597901389, "learning_rate": 7.834031971724728e-06, "loss": 0.0018, "num_tokens": 26098286.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 274.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6129865338498433, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.06222416553646326, "learning_rate": 7.8277449780532e-06, "loss": 0.0025, "num_tokens": 26107802.0, "reward": 1.5178571939468384, "reward_std": 0.29758501052856445, "rewards/fixed_code_pass_all_test_reward/mean": 0.5178571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.29758504033088684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 360.625, "completions/mean_terminated_length": 360.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.6131710016602103, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.07494595414027572, "learning_rate": 7.821458885386153e-06, "loss": 0.003, "num_tokens": 26118207.0, "reward": 1.5, "reward_std": 0.4672199487686157, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.4477022588253021, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 239.125, "completions/mean_terminated_length": 239.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.6133554694705774, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.076203728094697, "learning_rate": 7.81517369633092e-06, "loss": 0.003, "num_tokens": 26126664.0, "reward": 1.889423131942749, "reward_std": 0.31275880336761475, "rewards/fixed_code_pass_all_test_reward/mean": 0.8894230723381042, "rewards/fixed_code_pass_all_test_reward/std": 0.31275877356529236, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6135399372809445, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.07683886494487524, "learning_rate": 7.808889413494453e-06, "loss": 0.0031, "num_tokens": 26130762.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 323.625, "completions/mean_terminated_length": 323.625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.6137244050913115, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.03822963731363416, "learning_rate": 7.802606039483347e-06, "loss": 0.0015, "num_tokens": 26137455.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 200.875, "completions/mean_terminated_length": 200.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6139088729016786, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.09492160612717271, "learning_rate": 7.796323576903803e-06, "loss": 0.0038, "num_tokens": 26144718.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.6140933407120458, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.05806052195839584, "learning_rate": 7.790042028361656e-06, "loss": 0.0023, "num_tokens": 26152142.0, "reward": 1.9088234901428223, "reward_std": 0.19339051842689514, "rewards/fixed_code_pass_all_test_reward/mean": 0.908823549747467, "rewards/fixed_code_pass_all_test_reward/std": 0.19339054822921753, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 380.875, "completions/mean_terminated_length": 380.875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.6142778085224129, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.02099566941615194, "learning_rate": 7.783761396462353e-06, "loss": 0.0008, "num_tokens": 26160565.0, "reward": 1.8670213222503662, "reward_std": 0.11914350092411041, "rewards/fixed_code_pass_all_test_reward/mean": 0.8670213222503662, "rewards/fixed_code_pass_all_test_reward/std": 0.1191435232758522, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.6144622763327799, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.06488402746617794, "learning_rate": 7.777481683810967e-06, "loss": 0.0026, "num_tokens": 26168505.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 652.25, "completions/mean_terminated_length": 652.25, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.614646744143147, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.05513720726594329, "learning_rate": 7.771202893012182e-06, "loss": 0.0022, "num_tokens": 26184771.0, "reward": 1.4821428060531616, "reward_std": 0.444844126701355, "rewards/fixed_code_pass_all_test_reward/mean": 0.6071428060531616, "rewards/fixed_code_pass_all_test_reward/std": 0.4302687644958496, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 550.25, "completions/mean_terminated_length": 550.25, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.6148312119535141, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.04770561831537634, "learning_rate": 7.764925026670312e-06, "loss": 0.0019, "num_tokens": 26199581.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 411.0, "completions/mean_terminated_length": 411.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.6150156797638812, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.07697737962007523, "learning_rate": 7.758648087389277e-06, "loss": 0.0031, "num_tokens": 26206829.0, "reward": 1.7604167461395264, "reward_std": 0.42125454545021057, "rewards/fixed_code_pass_all_test_reward/mean": 0.7604166269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.4212545156478882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 294.0, "completions/mean_terminated_length": 294.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.6152001475742483, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.06257919268682599, "learning_rate": 7.752372077772614e-06, "loss": 0.0025, "num_tokens": 26213773.0, "reward": 1.459302306175232, "reward_std": 0.17656488716602325, "rewards/fixed_code_pass_all_test_reward/mean": 0.4593023359775543, "rewards/fixed_code_pass_all_test_reward/std": 0.17656488716602325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6153846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.05421578628011048, "learning_rate": 7.74609700042348e-06, "loss": 0.0022, "num_tokens": 26218672.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.6155690831949825, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.046256598085165024, "learning_rate": 7.73982285794464e-06, "loss": 0.0019, "num_tokens": 26224696.0, "reward": 1.8482142686843872, "reward_std": 0.35083720088005066, "rewards/fixed_code_pass_all_test_reward/mean": 0.8482142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.35083720088005066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 225.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.6157535510053496, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.05987920751795173, "learning_rate": 7.733549652938472e-06, "loss": 0.0024, "num_tokens": 26232661.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6159380188157166, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.050213148817420006, "learning_rate": 7.727277388006964e-06, "loss": 0.002, "num_tokens": 26242004.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 286.5, "completions/mean_terminated_length": 286.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.6161224866260837, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.0733923395164311, "learning_rate": 7.721006065751723e-06, "loss": 0.0029, "num_tokens": 26247848.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 135.5, "completions/mean_terminated_length": 135.5, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.6163069544364509, "frac_reward_zero_std": 1.0, "grad_norm": 0.1689453125, "kl": 0.06271887896582484, "learning_rate": 7.71473568877395e-06, "loss": 0.0025, "num_tokens": 26251644.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 927.625, "completions/mean_terminated_length": 927.625, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 0.616491422246818, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.02783687342889607, "learning_rate": 7.708466259674468e-06, "loss": 0.0011, "num_tokens": 26270873.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 323.75, "completions/mean_terminated_length": 323.75, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.616675890057185, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.057571186451241374, "learning_rate": 7.702197781053696e-06, "loss": 0.0023, "num_tokens": 26280471.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 298.125, "completions/mean_terminated_length": 298.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6168603578675521, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.04996775765903294, "learning_rate": 7.695930255511669e-06, "loss": 0.002, "num_tokens": 26287008.0, "reward": 1.6556122303009033, "reward_std": 0.10823062062263489, "rewards/fixed_code_pass_all_test_reward/mean": 0.6556122303009033, "rewards/fixed_code_pass_all_test_reward/std": 0.10823064297437668, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 394.125, "completions/mean_terminated_length": 394.125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.6170448256779192, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.034489622339606285, "learning_rate": 7.689663685648018e-06, "loss": 0.0014, "num_tokens": 26295553.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 304.875, "completions/mean_terminated_length": 304.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.6172292934882863, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.043727528071030974, "learning_rate": 7.683398074061979e-06, "loss": 0.0017, "num_tokens": 26304832.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 410.25, "completions/mean_terminated_length": 410.25, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.6174137612986533, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.07187942275777459, "learning_rate": 7.677133423352397e-06, "loss": 0.0029, "num_tokens": 26317298.0, "reward": 1.672619104385376, "reward_std": 0.1858494132757187, "rewards/fixed_code_pass_all_test_reward/mean": 0.6726190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.18584942817687988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 240.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.6175982291090205, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.05540585587732494, "learning_rate": 7.670869736117708e-06, "loss": 0.0022, "num_tokens": 26322715.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 181.5, "completions/mean_terminated_length": 181.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6177826969193876, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.06146111665293574, "learning_rate": 7.664607014955961e-06, "loss": 0.0025, "num_tokens": 26327039.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 344.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.6179671647297547, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.09234107611700892, "learning_rate": 7.65834526246479e-06, "loss": 0.0037, "num_tokens": 26335883.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 180.125, "completions/mean_terminated_length": 180.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.6181516325401217, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.10733635677024722, "learning_rate": 7.652084481241442e-06, "loss": 0.0043, "num_tokens": 26340108.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 3351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 233.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6183361003504888, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.05379103636369109, "learning_rate": 7.64582467388275e-06, "loss": 0.0022, "num_tokens": 26351130.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 279.125, "completions/mean_terminated_length": 279.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6185205681608559, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0942097743973136, "learning_rate": 7.639565842985148e-06, "loss": 0.0038, "num_tokens": 26356387.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 351.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.6187050359712231, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.07706864783540368, "learning_rate": 7.633307991144664e-06, "loss": 0.0031, "num_tokens": 26367772.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.6188895037815901, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.05360378907062113, "learning_rate": 7.627051120956917e-06, "loss": 0.0021, "num_tokens": 26371570.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 337.5, "completions/mean_terminated_length": 337.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6190739715919572, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.054614292457699776, "learning_rate": 7.620795235017126e-06, "loss": 0.0022, "num_tokens": 26378558.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.6192584394023243, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05296750459820032, "learning_rate": 7.614540335920093e-06, "loss": 0.0021, "num_tokens": 26385055.0, "reward": 1.5, "reward_std": 0.35376209020614624, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.35376203060150146, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 278.0, "completions/mean_terminated_length": 278.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6194429072126914, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.07554989866912365, "learning_rate": 7.60828642626022e-06, "loss": 0.003, "num_tokens": 26395135.0, "reward": 1.9663461446762085, "reward_std": 0.0951874852180481, "rewards/fixed_code_pass_all_test_reward/mean": 0.9663461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.09518745541572571, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.6196273750230584, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.07324827648699284, "learning_rate": 7.602033508631488e-06, "loss": 0.0029, "num_tokens": 26404532.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 348.75, "completions/mean_terminated_length": 348.75, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.6198118428334256, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.059371348936110735, "learning_rate": 7.595781585627478e-06, "loss": 0.0024, "num_tokens": 26414634.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.6199963106437927, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.052618196699768305, "learning_rate": 7.589530659841349e-06, "loss": 0.0021, "num_tokens": 26420652.0, "reward": 1.4342105388641357, "reward_std": 0.23156186938285828, "rewards/fixed_code_pass_all_test_reward/mean": 0.43421053886413574, "rewards/fixed_code_pass_all_test_reward/std": 0.23156186938285828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 254.25, "completions/mean_terminated_length": 254.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6201807784541598, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.04258507466875017, "learning_rate": 7.583280733865852e-06, "loss": 0.0017, "num_tokens": 26426998.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.6203652462645268, "frac_reward_zero_std": 1.0, "grad_norm": 0.22265625, "kl": 0.07047036220319569, "learning_rate": 7.577031810293316e-06, "loss": 0.0028, "num_tokens": 26435527.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.6205497140748939, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.04087355500087142, "learning_rate": 7.5707838917156655e-06, "loss": 0.0016, "num_tokens": 26444682.0, "reward": 1.0317460298538208, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0317460335791111, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.620734181885261, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.03892953158356249, "learning_rate": 7.564536980724404e-06, "loss": 0.0016, "num_tokens": 26451020.0, "reward": 1.9945652484893799, "reward_std": 0.007500702515244484, "rewards/fixed_code_pass_all_test_reward/mean": 0.9945652484893799, "rewards/fixed_code_pass_all_test_reward/std": 0.0075007108971476555, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 312.0, "completions/mean_terminated_length": 312.0, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.6209186496956282, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.06441773194819689, "learning_rate": 7.558291079910611e-06, "loss": 0.0026, "num_tokens": 26457452.0, "reward": 1.75, "reward_std": 0.1511857956647873, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.1511857956647873, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 1055.0, "completions/mean_terminated_length": 1055.0, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.6211031175059952, "frac_reward_zero_std": 0.0, "grad_norm": 0.349609375, "kl": 0.012395594618283212, "learning_rate": 7.5520461918649505e-06, "loss": 0.0005, "num_tokens": 26478004.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 463.625, "completions/mean_terminated_length": 463.625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.6212875853163623, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.05261903698556125, "learning_rate": 7.54580231917767e-06, "loss": 0.0021, "num_tokens": 26487657.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 237.25, "completions/mean_terminated_length": 237.25, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6214720531267294, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.08373894169926643, "learning_rate": 7.53955946443859e-06, "loss": 0.0033, "num_tokens": 26497731.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 320.0, "completions/mean_terminated_length": 320.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6216565209370964, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.04986072634346783, "learning_rate": 7.533317630237117e-06, "loss": 0.002, "num_tokens": 26507515.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 147.625, "completions/mean_terminated_length": 147.625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.6218409887474635, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.06434177025221288, "learning_rate": 7.527076819162222e-06, "loss": 0.0026, "num_tokens": 26513352.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 192.625, "completions/mean_terminated_length": 192.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.6220254565578307, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.05773958610370755, "learning_rate": 7.520837033802466e-06, "loss": 0.0023, "num_tokens": 26520661.0, "reward": 1.9191176891326904, "reward_std": 0.22876986861228943, "rewards/fixed_code_pass_all_test_reward/mean": 0.9191176295280457, "rewards/fixed_code_pass_all_test_reward/std": 0.22876983880996704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 252.875, "completions/mean_terminated_length": 252.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.6222099243681978, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.10142938699573278, "learning_rate": 7.514598276745971e-06, "loss": 0.0041, "num_tokens": 26531380.0, "reward": 1.9513888359069824, "reward_std": 0.13749296963214874, "rewards/fixed_code_pass_all_test_reward/mean": 0.9513888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.13749298453330994, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.6223943921785648, "frac_reward_zero_std": 1.0, "grad_norm": 0.263671875, "kl": 0.10089533729478717, "learning_rate": 7.508360550580445e-06, "loss": 0.004, "num_tokens": 26534974.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 265.625, "completions/mean_terminated_length": 265.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6225788599889319, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.0673963944427669, "learning_rate": 7.502123857893154e-06, "loss": 0.0027, "num_tokens": 26543515.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 197.5, "completions/mean_terminated_length": 197.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.622763327799299, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.08622114919126034, "learning_rate": 7.495888201270951e-06, "loss": 0.0034, "num_tokens": 26554487.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.6229477956096661, "frac_reward_zero_std": 1.0, "grad_norm": 0.474609375, "kl": 0.07468194328248501, "learning_rate": 7.489653583300249e-06, "loss": 0.003, "num_tokens": 26562463.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.6231322634200332, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.10291409026831388, "learning_rate": 7.48342000656703e-06, "loss": 0.0041, "num_tokens": 26570233.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6233167312304003, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.06961537571623921, "learning_rate": 7.477187473656853e-06, "loss": 0.0028, "num_tokens": 26578298.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 438.25, "completions/mean_terminated_length": 438.25, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.6235011990407674, "frac_reward_zero_std": 0.0, "grad_norm": 0.7109375, "kl": 0.027460883371531963, "learning_rate": 7.470955987154831e-06, "loss": 0.0011, "num_tokens": 26589332.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 383.25, "completions/mean_terminated_length": 383.25, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6236856668511345, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.030979155795648694, "learning_rate": 7.464725549645657e-06, "loss": 0.0012, "num_tokens": 26597262.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.6238701346615015, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.08972385991364717, "learning_rate": 7.458496163713576e-06, "loss": 0.0036, "num_tokens": 26604493.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.6240546024718686, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.04372918698936701, "learning_rate": 7.452267831942408e-06, "loss": 0.0017, "num_tokens": 26608981.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.6242390702822358, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.10180951468646526, "learning_rate": 7.446040556915526e-06, "loss": 0.0041, "num_tokens": 26617327.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 388.5, "completions/mean_terminated_length": 388.5, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.6244235380926029, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.05401134630665183, "learning_rate": 7.439814341215872e-06, "loss": 0.0022, "num_tokens": 26627171.0, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 232.5, "completions/mean_terminated_length": 232.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6246080059029699, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.04014529357664287, "learning_rate": 7.433589187425946e-06, "loss": 0.0016, "num_tokens": 26632943.0, "reward": 1.93359375, "reward_std": 0.18782523274421692, "rewards/fixed_code_pass_all_test_reward/mean": 0.93359375, "rewards/fixed_code_pass_all_test_reward/std": 0.18782523274421692, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.624792473713337, "frac_reward_zero_std": 1.0, "grad_norm": 0.87890625, "kl": 0.12462977156974375, "learning_rate": 7.427365098127803e-06, "loss": 0.005, "num_tokens": 26641222.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 382.625, "completions/mean_terminated_length": 382.625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.6249769415237041, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.046450185123831034, "learning_rate": 7.421142075903067e-06, "loss": 0.0019, "num_tokens": 26652979.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6251614093340712, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.05388401937671006, "learning_rate": 7.4149201233329096e-06, "loss": 0.0022, "num_tokens": 26658050.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 308.375, "completions/mean_terminated_length": 308.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.6253458771444383, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.08459887281060219, "learning_rate": 7.408699242998064e-06, "loss": 0.0034, "num_tokens": 26666965.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 262.5, "completions/mean_terminated_length": 262.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.6255303449548054, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.07600662717595696, "learning_rate": 7.402479437478815e-06, "loss": 0.003, "num_tokens": 26676169.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 327.625, "completions/mean_terminated_length": 327.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6257148127651725, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.062000686302781105, "learning_rate": 7.396260709355005e-06, "loss": 0.0025, "num_tokens": 26686398.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 372.0, "completions/mean_terminated_length": 372.0, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.6258992805755396, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.022797331563197076, "learning_rate": 7.390043061206028e-06, "loss": 0.0009, "num_tokens": 26694550.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 275.375, "completions/mean_terminated_length": 275.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.6260837483859066, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.03108350478578359, "learning_rate": 7.383826495610831e-06, "loss": 0.0012, "num_tokens": 26699585.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 428.25, "completions/mean_terminated_length": 428.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.6262682161962737, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.06465544155798852, "learning_rate": 7.37761101514791e-06, "loss": 0.0026, "num_tokens": 26710259.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 564.125, "completions/mean_terminated_length": 564.125, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.6264526840066409, "frac_reward_zero_std": 1.0, "grad_norm": 0.051025390625, "kl": 0.036143806762993336, "learning_rate": 7.371396622395313e-06, "loss": 0.0014, "num_tokens": 26720692.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.626637151817008, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.056550185894593596, "learning_rate": 7.3651833199306355e-06, "loss": 0.0023, "num_tokens": 26729784.0, "reward": 1.3392856121063232, "reward_std": 0.2850758135318756, "rewards/fixed_code_pass_all_test_reward/mean": 0.3392857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.285075843334198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 305.25, "completions/mean_terminated_length": 305.25, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.626821619627375, "frac_reward_zero_std": 1.0, "grad_norm": 0.06103515625, "kl": 0.0359369064681232, "learning_rate": 7.358971110331019e-06, "loss": 0.0014, "num_tokens": 26736762.0, "reward": 1.8571429252624512, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1165.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 880.125, "completions/mean_terminated_length": 880.125, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 0.6270060874377421, "frac_reward_zero_std": 0.0, "grad_norm": 0.5234375, "kl": 0.024893052846891806, "learning_rate": 7.3527599961731575e-06, "loss": 0.001, "num_tokens": 26752611.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 318.125, "completions/mean_terminated_length": 318.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.6271905552481092, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.036876096623018384, "learning_rate": 7.346549980033284e-06, "loss": 0.0015, "num_tokens": 26763740.0, "reward": 1.9772727489471436, "reward_std": 0.06428244709968567, "rewards/fixed_code_pass_all_test_reward/mean": 0.9772727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 307.5, "completions/mean_terminated_length": 307.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6273750230584763, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.09042846178635955, "learning_rate": 7.3403410644871834e-06, "loss": 0.0036, "num_tokens": 26772784.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 198.25, "completions/mean_terminated_length": 198.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.6275594908688434, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.03892206365708262, "learning_rate": 7.334133252110174e-06, "loss": 0.0016, "num_tokens": 26777050.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.6277439586792105, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.05059746582992375, "learning_rate": 7.327926545477123e-06, "loss": 0.002, "num_tokens": 26786399.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 490.375, "completions/mean_terminated_length": 490.375, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.6279284264895776, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.025717539247125387, "learning_rate": 7.321720947162445e-06, "loss": 0.001, "num_tokens": 26799690.0, "reward": 1.355769157409668, "reward_std": 0.28169724345207214, "rewards/fixed_code_pass_all_test_reward/mean": 0.35576921701431274, "rewards/fixed_code_pass_all_test_reward/std": 0.28169727325439453, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 214.625, "completions/mean_terminated_length": 214.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.6281128942999447, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.036148205399513245, "learning_rate": 7.315516459740082e-06, "loss": 0.0014, "num_tokens": 26805375.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 327.25, "completions/mean_terminated_length": 327.25, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.6282973621103117, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.03209987422451377, "learning_rate": 7.3093130857835245e-06, "loss": 0.0013, "num_tokens": 26811665.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 288.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.6284818299206788, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0591207523830235, "learning_rate": 7.3031108278657956e-06, "loss": 0.0024, "num_tokens": 26819509.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.628666297731046, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.04073520144447684, "learning_rate": 7.296909688559462e-06, "loss": 0.0016, "num_tokens": 26825482.0, "reward": 1.197115421295166, "reward_std": 0.12238387763500214, "rewards/fixed_code_pass_all_test_reward/mean": 0.19711539149284363, "rewards/fixed_code_pass_all_test_reward/std": 0.12238387018442154, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6288507655414131, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.044894269201904535, "learning_rate": 7.290709670436618e-06, "loss": 0.0018, "num_tokens": 26831881.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 335.125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6290352333517801, "frac_reward_zero_std": 1.0, "grad_norm": 0.205078125, "kl": 0.053354858537204564, "learning_rate": 7.284510776068896e-06, "loss": 0.0021, "num_tokens": 26842586.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6292197011621472, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.046437570825219154, "learning_rate": 7.278313008027469e-06, "loss": 0.0019, "num_tokens": 26851899.0, "reward": 1.2861841917037964, "reward_std": 0.28842511773109436, "rewards/fixed_code_pass_all_test_reward/mean": 0.2861842215061188, "rewards/fixed_code_pass_all_test_reward/std": 0.28842514753341675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 288.625, "completions/mean_terminated_length": 288.625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.6294041689725143, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.09197411686182022, "learning_rate": 7.272116368883033e-06, "loss": 0.0037, "num_tokens": 26857792.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 276.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.6295886367828814, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.036331275245174766, "learning_rate": 7.265920861205823e-06, "loss": 0.0015, "num_tokens": 26864496.0, "reward": 1.6964285373687744, "reward_std": 0.05050764977931976, "rewards/fixed_code_pass_all_test_reward/mean": 0.6964285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 591.5, "completions/mean_terminated_length": 591.5, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.6297731045932484, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.05358920991420746, "learning_rate": 7.259726487565598e-06, "loss": 0.0021, "num_tokens": 26874388.0, "reward": 1.8928571939468384, "reward_std": 0.14787116646766663, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.14787118136882782, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 416.0, "completions/mean_terminated_length": 416.0, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.6299575724036156, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.032816674560308456, "learning_rate": 7.253533250531656e-06, "loss": 0.0013, "num_tokens": 26883340.0, "reward": 1.9874999523162842, "reward_std": 0.035355329513549805, "rewards/fixed_code_pass_all_test_reward/mean": 0.987500011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553481400013, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 295.875, "completions/mean_terminated_length": 295.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6301420402139827, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.0701943701133132, "learning_rate": 7.247341152672813e-06, "loss": 0.0028, "num_tokens": 26893027.0, "reward": 1.8245192766189575, "reward_std": 0.3331560790538788, "rewards/fixed_code_pass_all_test_reward/mean": 0.8245192766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.33315610885620117, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 410.125, "completions/mean_terminated_length": 410.125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6303265080243498, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0498794608283788, "learning_rate": 7.24115019655742e-06, "loss": 0.002, "num_tokens": 26900748.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 386.5, "completions/mean_terminated_length": 386.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6305109758347168, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.05718769528903067, "learning_rate": 7.2349603847533524e-06, "loss": 0.0023, "num_tokens": 26908928.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.6306954436450839, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.07098955940455198, "learning_rate": 7.2287717198280075e-06, "loss": 0.0028, "num_tokens": 26918888.0, "reward": 1.2386363744735718, "reward_std": 0.06763848662376404, "rewards/fixed_code_pass_all_test_reward/mean": 0.23863637447357178, "rewards/fixed_code_pass_all_test_reward/std": 0.06763853132724762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 407.0, "completions/mean_terminated_length": 407.0, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.630879911455451, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.04452435369603336, "learning_rate": 7.222584204348313e-06, "loss": 0.0018, "num_tokens": 26929320.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.6310643792658182, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.0658290977589786, "learning_rate": 7.216397840880713e-06, "loss": 0.0026, "num_tokens": 26934228.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.6312488470761852, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.07559730904176831, "learning_rate": 7.2102126319911825e-06, "loss": 0.003, "num_tokens": 26940809.0, "reward": 1.6681034564971924, "reward_std": 0.27022412419319153, "rewards/fixed_code_pass_all_test_reward/mean": 0.6681034564971924, "rewards/fixed_code_pass_all_test_reward/std": 0.27022409439086914, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6314333148865523, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.07156076654791832, "learning_rate": 7.204028580245208e-06, "loss": 0.0029, "num_tokens": 26945153.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 366.625, "completions/mean_terminated_length": 366.625, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.6316177826969194, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.031657002633437514, "learning_rate": 7.197845688207805e-06, "loss": 0.0013, "num_tokens": 26951982.0, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 292.75, "completions/mean_terminated_length": 292.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.6318022505072864, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.07235288200899959, "learning_rate": 7.1916639584435e-06, "loss": 0.0029, "num_tokens": 26961460.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 439.0, "completions/mean_terminated_length": 439.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6319867183176535, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.056082650320604444, "learning_rate": 7.185483393516342e-06, "loss": 0.0022, "num_tokens": 26972060.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6321711861280207, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.04524471703916788, "learning_rate": 7.1793039959898954e-06, "loss": 0.0018, "num_tokens": 26976518.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.6323556539383878, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.08332480490207672, "learning_rate": 7.173125768427244e-06, "loss": 0.0033, "num_tokens": 26984662.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 260.875, "completions/mean_terminated_length": 260.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.6325401217487548, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.08977919956669211, "learning_rate": 7.16694871339098e-06, "loss": 0.0036, "num_tokens": 26994645.0, "reward": 1.691666603088379, "reward_std": 0.3786986768245697, "rewards/fixed_code_pass_all_test_reward/mean": 0.6916666626930237, "rewards/fixed_code_pass_all_test_reward/std": 0.3786986768245697, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6327245895591219, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.054486887995153666, "learning_rate": 7.160772833443211e-06, "loss": 0.0022, "num_tokens": 27000722.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.632909057369489, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.07816836284473538, "learning_rate": 7.154598131145563e-06, "loss": 0.0031, "num_tokens": 27009690.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 474.625, "completions/mean_terminated_length": 474.625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.6330935251798561, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.07401817524805665, "learning_rate": 7.148424609059166e-06, "loss": 0.003, "num_tokens": 27017879.0, "reward": 1.7410714626312256, "reward_std": 0.34770700335502625, "rewards/fixed_code_pass_all_test_reward/mean": 0.8660714030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.18483899533748627, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.6332779929902232, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.07655417360365391, "learning_rate": 7.142252269744665e-06, "loss": 0.0031, "num_tokens": 27022336.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 527.375, "completions/mean_terminated_length": 527.375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.6334624608005903, "frac_reward_zero_std": 1.0, "grad_norm": 0.2470703125, "kl": 0.06906895944848657, "learning_rate": 7.136081115762212e-06, "loss": 0.0028, "num_tokens": 27033971.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 198.0, "completions/mean_terminated_length": 198.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.6336469286109574, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.05103369406424463, "learning_rate": 7.129911149671472e-06, "loss": 0.002, "num_tokens": 27039587.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.6338313964213245, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.0714669672306627, "learning_rate": 7.123742374031606e-06, "loss": 0.0029, "num_tokens": 27048660.0, "reward": 1.7828283309936523, "reward_std": 0.4031013548374176, "rewards/fixed_code_pass_all_test_reward/mean": 0.7828282713890076, "rewards/fixed_code_pass_all_test_reward/std": 0.4031013548374176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 319.75, "completions/mean_terminated_length": 319.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.6340158642316915, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.062286960426718, "learning_rate": 7.117574791401298e-06, "loss": 0.0025, "num_tokens": 27059674.0, "reward": 1.4895832538604736, "reward_std": 0.1439073532819748, "rewards/fixed_code_pass_all_test_reward/mean": 0.4895833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.1439073532819748, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 302.625, "completions/mean_terminated_length": 302.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6342003320420586, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.057185497134923935, "learning_rate": 7.11140840433872e-06, "loss": 0.0023, "num_tokens": 27069295.0, "reward": 1.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 311.0, "completions/mean_terminated_length": 311.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.6343847998524258, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.04939037701115012, "learning_rate": 7.105243215401564e-06, "loss": 0.002, "num_tokens": 27081463.0, "reward": 1.3235294818878174, "reward_std": 0.5347866415977478, "rewards/fixed_code_pass_all_test_reward/mean": 0.4485294222831726, "rewards/fixed_code_pass_all_test_reward/std": 0.1812332570552826, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 137.875, "completions/mean_terminated_length": 137.875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.6345692676627929, "frac_reward_zero_std": 0.0, "grad_norm": 3.1875, "kl": 0.0806895773857832, "learning_rate": 7.099079227147012e-06, "loss": 0.0032, "num_tokens": 27088006.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 440.0, "completions/mean_terminated_length": 440.0, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.6347537354731599, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.03978939959779382, "learning_rate": 7.092916442131754e-06, "loss": 0.0016, "num_tokens": 27096398.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 408.375, "completions/mean_terminated_length": 408.375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.634938203283527, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.06925387820228934, "learning_rate": 7.086754862911982e-06, "loss": 0.0028, "num_tokens": 27102569.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 147.25, "completions/mean_terminated_length": 147.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.6351226710938941, "frac_reward_zero_std": 1.0, "grad_norm": 0.1845703125, "kl": 0.05569833633489907, "learning_rate": 7.08059449204338e-06, "loss": 0.0022, "num_tokens": 27106723.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 500.5, "completions/mean_terminated_length": 500.5, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.6353071389042612, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.03110878076404333, "learning_rate": 7.0744353320811435e-06, "loss": 0.0012, "num_tokens": 27116343.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.6354916067146283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.056923771277070045, "learning_rate": 7.068277385579959e-06, "loss": 0.0023, "num_tokens": 27126460.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 386.75, "completions/mean_terminated_length": 386.75, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.6356760745249954, "frac_reward_zero_std": 1.0, "grad_norm": 0.037109375, "kl": 0.022183642722666264, "learning_rate": 7.062120655094007e-06, "loss": 0.0009, "num_tokens": 27134514.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.6358605423353625, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.07018017629161477, "learning_rate": 7.055965143176968e-06, "loss": 0.0028, "num_tokens": 27138794.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 369.375, "completions/mean_terminated_length": 369.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.6360450101457296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.03580386331304908, "learning_rate": 7.049810852382013e-06, "loss": 0.0014, "num_tokens": 27146741.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.6362294779560966, "frac_reward_zero_std": 1.0, "grad_norm": 0.19921875, "kl": 0.07930405181832612, "learning_rate": 7.043657785261812e-06, "loss": 0.0032, "num_tokens": 27151911.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 433.875, "completions/mean_terminated_length": 433.875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.6364139457664637, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.058135490166023374, "learning_rate": 7.0375059443685215e-06, "loss": 0.0023, "num_tokens": 27160526.0, "reward": 1.644230842590332, "reward_std": 0.13598209619522095, "rewards/fixed_code_pass_all_test_reward/mean": 0.6442307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.13598208129405975, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6365984135768309, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.054007151862606406, "learning_rate": 7.031355332253795e-06, "loss": 0.0022, "num_tokens": 27168273.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 531.625, "completions/mean_terminated_length": 531.625, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.636782881387198, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.034286344423890114, "learning_rate": 7.025205951468772e-06, "loss": 0.0014, "num_tokens": 27178246.0, "reward": 1.2916667461395264, "reward_std": 0.1178511381149292, "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511381149292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.636967349197565, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.04852675017900765, "learning_rate": 7.0190578045640825e-06, "loss": 0.0019, "num_tokens": 27183163.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 304.875, "completions/mean_terminated_length": 304.875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6371518170079321, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.09500947780907154, "learning_rate": 7.012910894089846e-06, "loss": 0.0038, "num_tokens": 27192098.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 246.375, "completions/mean_terminated_length": 246.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6373362848182992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.05043016956187785, "learning_rate": 7.0067652225956664e-06, "loss": 0.002, "num_tokens": 27199533.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 210.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6375207526286663, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.062386639416217804, "learning_rate": 7.000620792630639e-06, "loss": 0.0025, "num_tokens": 27205645.0, "reward": 1.9137930870056152, "reward_std": 0.11945178359746933, "rewards/fixed_code_pass_all_test_reward/mean": 0.9137930870056152, "rewards/fixed_code_pass_all_test_reward/std": 0.11945178359746933, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6377052204390334, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.057619214756414294, "learning_rate": 6.9944776067433375e-06, "loss": 0.0023, "num_tokens": 27213144.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 420.125, "completions/mean_terminated_length": 420.125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.6378896882494005, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05808654148131609, "learning_rate": 6.988335667481825e-06, "loss": 0.0023, "num_tokens": 27221833.0, "reward": 1.4553570747375488, "reward_std": 0.4516531229019165, "rewards/fixed_code_pass_all_test_reward/mean": 0.4553571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.4516531229019165, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.6380741560597676, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.04742952063679695, "learning_rate": 6.982194977393641e-06, "loss": 0.0019, "num_tokens": 27231516.0, "reward": 1.115384578704834, "reward_std": 0.07962294667959213, "rewards/fixed_code_pass_all_test_reward/mean": 0.11538462340831757, "rewards/fixed_code_pass_all_test_reward/std": 0.07962294667959213, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.6382586238701347, "frac_reward_zero_std": 1.0, "grad_norm": 0.169921875, "kl": 0.062274265103042126, "learning_rate": 6.976055539025819e-06, "loss": 0.0025, "num_tokens": 27236933.0, "reward": 1.6206896305084229, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6206896305084229, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 724.0, "completions/mean_terminated_length": 724.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6384430916805017, "frac_reward_zero_std": 0.0, "grad_norm": 4.09375, "kl": 0.03894087404478341, "learning_rate": 6.96991735492486e-06, "loss": 0.0016, "num_tokens": 27253261.0, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 216.625, "completions/mean_terminated_length": 216.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.6386275594908688, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.06827914156019688, "learning_rate": 6.963780427636748e-06, "loss": 0.0027, "num_tokens": 27261346.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 297.875, "completions/mean_terminated_length": 297.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.638812027301236, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.07782627525739372, "learning_rate": 6.957644759706954e-06, "loss": 0.0031, "num_tokens": 27268153.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.6389964951116031, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.07157603511586785, "learning_rate": 6.951510353680415e-06, "loss": 0.0029, "num_tokens": 27274274.0, "reward": 1.9500000476837158, "reward_std": 0.1414213627576828, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 334.5, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.6391809629219701, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.05371494474820793, "learning_rate": 6.945377212101554e-06, "loss": 0.0021, "num_tokens": 27280942.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 129.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.6393654307323372, "frac_reward_zero_std": 1.0, "grad_norm": 0.166015625, "kl": 0.049601655220612884, "learning_rate": 6.939245337514263e-06, "loss": 0.002, "num_tokens": 27284717.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 181.375, "completions/mean_terminated_length": 181.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.6395498985427043, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.04647269379347563, "learning_rate": 6.933114732461914e-06, "loss": 0.0019, "num_tokens": 27288840.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.6397343663530713, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.06405734783038497, "learning_rate": 6.926985399487347e-06, "loss": 0.0026, "num_tokens": 27296578.0, "reward": 1.9874999523162842, "reward_std": 0.035355329513549805, "rewards/fixed_code_pass_all_test_reward/mean": 0.987500011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553481400013, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6399188341634385, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.08573625795543194, "learning_rate": 6.92085734113288e-06, "loss": 0.0034, "num_tokens": 27304522.0, "reward": 1.0201612710952759, "reward_std": 0.016695119440555573, "rewards/fixed_code_pass_all_test_reward/mean": 0.02016128972172737, "rewards/fixed_code_pass_all_test_reward/std": 0.016695134341716766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6401033019738056, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.04086292232386768, "learning_rate": 6.914730559940295e-06, "loss": 0.0016, "num_tokens": 27311263.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.6402877697841727, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.058121690060943365, "learning_rate": 6.9086050584508525e-06, "loss": 0.0023, "num_tokens": 27315917.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 381.0, "completions/mean_terminated_length": 381.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6404722375945398, "frac_reward_zero_std": 1.0, "grad_norm": 0.048095703125, "kl": 0.032763579452876, "learning_rate": 6.902480839205277e-06, "loss": 0.0013, "num_tokens": 27323813.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.6406567054049068, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.046212319983169436, "learning_rate": 6.8963579047437605e-06, "loss": 0.0018, "num_tokens": 27328269.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 336.375, "completions/mean_terminated_length": 336.375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.6408411732152739, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.04816126055084169, "learning_rate": 6.890236257605967e-06, "loss": 0.0019, "num_tokens": 27335920.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1283.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 486.25, "completions/mean_terminated_length": 486.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.6410256410256411, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.048981004394590855, "learning_rate": 6.88411590033102e-06, "loss": 0.002, "num_tokens": 27345978.0, "reward": 1.8854167461395264, "reward_std": 0.3240906000137329, "rewards/fixed_code_pass_all_test_reward/mean": 0.8854166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6412101088360082, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.06168783246539533, "learning_rate": 6.877996835457515e-06, "loss": 0.0025, "num_tokens": 27352127.0, "reward": 1.4287633895874023, "reward_std": 0.4730287790298462, "rewards/fixed_code_pass_all_test_reward/mean": 0.4287634491920471, "rewards/fixed_code_pass_all_test_reward/std": 0.4730288088321686, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 119.625, "completions/mean_terminated_length": 119.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.6413945766463752, "frac_reward_zero_std": 1.0, "grad_norm": 0.1845703125, "kl": 0.03898730146465823, "learning_rate": 6.871879065523505e-06, "loss": 0.0016, "num_tokens": 27355868.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 335.625, "completions/mean_terminated_length": 335.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6415790444567423, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.10824339231476188, "learning_rate": 6.865762593066514e-06, "loss": 0.0043, "num_tokens": 27366857.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.6417635122671094, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.060188284143805504, "learning_rate": 6.859647420623515e-06, "loss": 0.0024, "num_tokens": 27371530.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 365.75, "completions/mean_terminated_length": 365.75, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.6419479800774764, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.03213056304957718, "learning_rate": 6.853533550730958e-06, "loss": 0.0013, "num_tokens": 27378232.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 336.125, "completions/mean_terminated_length": 336.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6421324478878435, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.07865431671962142, "learning_rate": 6.847420985924737e-06, "loss": 0.0031, "num_tokens": 27387537.0, "reward": 1.25, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 358.625, "completions/mean_terminated_length": 358.625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.6423169156982107, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.06185546610504389, "learning_rate": 6.841309728740214e-06, "loss": 0.0025, "num_tokens": 27397654.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 212.875, "completions/mean_terminated_length": 212.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6425013835085778, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.07139132195152342, "learning_rate": 6.835199781712205e-06, "loss": 0.0029, "num_tokens": 27406189.0, "reward": 1.8863635063171387, "reward_std": 0.09409989416599274, "rewards/fixed_code_pass_all_test_reward/mean": 0.8863636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.09409984946250916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 349.0, "completions/mean_terminated_length": 349.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6426858513189448, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.04061998857650906, "learning_rate": 6.829091147374991e-06, "loss": 0.0016, "num_tokens": 27412469.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 201.75, "completions/mean_terminated_length": 201.75, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.6428703191293119, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.0684271939098835, "learning_rate": 6.822983828262294e-06, "loss": 0.0027, "num_tokens": 27416859.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 253.0, "completions/mean_terminated_length": 253.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.643054786939679, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.13469182513654232, "learning_rate": 6.816877826907304e-06, "loss": 0.0054, "num_tokens": 27421755.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6432392547500461, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.05690271011553705, "learning_rate": 6.810773145842653e-06, "loss": 0.0023, "num_tokens": 27428070.0, "reward": 1.712499976158142, "reward_std": 0.24164614081382751, "rewards/fixed_code_pass_all_test_reward/mean": 0.7124999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.24164614081382751, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 250.125, "completions/mean_terminated_length": 250.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6434237225604132, "frac_reward_zero_std": 1.0, "grad_norm": 0.3203125, "kl": 0.06079932535067201, "learning_rate": 6.8046697876004354e-06, "loss": 0.0024, "num_tokens": 27436343.0, "reward": 1.7285714149475098, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7285714149475098, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6436081903707803, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.07232512114569545, "learning_rate": 6.798567754712187e-06, "loss": 0.0029, "num_tokens": 27441330.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 185.75, "completions/mean_terminated_length": 185.75, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6437926581811474, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.05397963663563132, "learning_rate": 6.792467049708906e-06, "loss": 0.0022, "num_tokens": 27449736.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 305.5, "completions/mean_terminated_length": 305.5, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.6439771259915145, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.0906495307572186, "learning_rate": 6.786367675121028e-06, "loss": 0.0036, "num_tokens": 27459828.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 514.25, "completions/mean_terminated_length": 514.25, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.6441615938018815, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.037488566944375634, "learning_rate": 6.780269633478447e-06, "loss": 0.0015, "num_tokens": 27469838.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6443460616122486, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.04728831211104989, "learning_rate": 6.7741729273104935e-06, "loss": 0.0019, "num_tokens": 27474703.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.6445305294226158, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.04425991442985833, "learning_rate": 6.768077559145951e-06, "loss": 0.0018, "num_tokens": 27484444.0, "reward": 1.7471264600753784, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7471264600753784, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 177.375, "completions/mean_terminated_length": 177.375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.6447149972329829, "frac_reward_zero_std": 0.0, "grad_norm": 3.390625, "kl": 0.13526874827221036, "learning_rate": 6.761983531513048e-06, "loss": 0.0054, "num_tokens": 27488655.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 387.5, "completions/mean_terminated_length": 387.5, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.6448994650433499, "frac_reward_zero_std": 1.0, "grad_norm": 0.033447265625, "kl": 0.014006309385877103, "learning_rate": 6.755890846939454e-06, "loss": 0.0006, "num_tokens": 27496203.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 277.625, "completions/mean_terminated_length": 277.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.645083932853717, "frac_reward_zero_std": 1.0, "grad_norm": 0.1767578125, "kl": 0.07205021940171719, "learning_rate": 6.749799507952285e-06, "loss": 0.0029, "num_tokens": 27505184.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 400.875, "completions/mean_terminated_length": 400.875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.6452684006640841, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.029827337712049484, "learning_rate": 6.743709517078094e-06, "loss": 0.0012, "num_tokens": 27513703.0, "reward": 1.4444444179534912, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4444444477558136, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 523.375, "completions/mean_terminated_length": 523.375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.6454528684744512, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.03436588787008077, "learning_rate": 6.737620876842882e-06, "loss": 0.0014, "num_tokens": 27527898.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 331.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6456373362848183, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.03747973119607195, "learning_rate": 6.73153358977208e-06, "loss": 0.0015, "num_tokens": 27533709.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6458218040951854, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.05080918734893203, "learning_rate": 6.725447658390569e-06, "loss": 0.002, "num_tokens": 27537740.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 490.25, "completions/mean_terminated_length": 490.25, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.6460062719055525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0218505859375, "kl": 0.019035870267543942, "learning_rate": 6.719363085222656e-06, "loss": 0.0008, "num_tokens": 27546998.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.6461907397159196, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.029949702322483063, "learning_rate": 6.713279872792098e-06, "loss": 0.0012, "num_tokens": 27551695.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.6463752075262866, "frac_reward_zero_std": 1.0, "grad_norm": 0.212890625, "kl": 0.042732295114547014, "learning_rate": 6.707198023622074e-06, "loss": 0.0017, "num_tokens": 27556291.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6465596753366537, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.08038721373304725, "learning_rate": 6.701117540235204e-06, "loss": 0.0032, "num_tokens": 27562060.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.6467441431470209, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796875, "kl": 0.08034749608486891, "learning_rate": 6.6950384251535436e-06, "loss": 0.0032, "num_tokens": 27568296.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.646928610957388, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.07465152209624648, "learning_rate": 6.688960680898578e-06, "loss": 0.003, "num_tokens": 27576112.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 344.5, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.647113078767755, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.05335966683924198, "learning_rate": 6.682884309991223e-06, "loss": 0.0021, "num_tokens": 27586964.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.6472975465781221, "frac_reward_zero_std": 0.0, "grad_norm": 3.734375, "kl": 0.0844319041352719, "learning_rate": 6.676809314951827e-06, "loss": 0.0034, "num_tokens": 27593190.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.6474820143884892, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.062117355642840266, "learning_rate": 6.670735698300168e-06, "loss": 0.0025, "num_tokens": 27600502.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 287.125, "completions/mean_terminated_length": 287.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6476664821988563, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.06516063818708062, "learning_rate": 6.6646634625554475e-06, "loss": 0.0026, "num_tokens": 27606311.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.6478509500092234, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.026544867258053273, "learning_rate": 6.658592610236302e-06, "loss": 0.0011, "num_tokens": 27611883.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 277.5, "completions/mean_terminated_length": 277.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.6480354178195905, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.050706715090200305, "learning_rate": 6.652523143860787e-06, "loss": 0.002, "num_tokens": 27621439.0, "reward": 1.4375, "reward_std": 0.25035685300827026, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.25035688281059265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 409.25, "completions/mean_terminated_length": 409.25, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.6482198856299576, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.03232401283457875, "learning_rate": 6.646455065946386e-06, "loss": 0.0013, "num_tokens": 27630273.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.6484043534403247, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.040331095457077026, "learning_rate": 6.64038837901001e-06, "loss": 0.0016, "num_tokens": 27634470.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.6485888212506917, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.10023286193609238, "learning_rate": 6.6343230855679855e-06, "loss": 0.004, "num_tokens": 27642795.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 199.875, "completions/mean_terminated_length": 199.875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.6487732890610588, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.06893921340815723, "learning_rate": 6.628259188136068e-06, "loss": 0.0028, "num_tokens": 27647290.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.648957756871426, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.04177714907564223, "learning_rate": 6.62219668922943e-06, "loss": 0.0017, "num_tokens": 27655098.0, "reward": 1.7799999713897705, "reward_std": 0.4123798906803131, "rewards/fixed_code_pass_all_test_reward/mean": 0.9049999713897705, "rewards/fixed_code_pass_all_test_reward/std": 0.26870056986808777, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 261.0, "completions/mean_terminated_length": 261.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.6491422246817931, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.09284602757543325, "learning_rate": 6.616135591362667e-06, "loss": 0.0037, "num_tokens": 27663994.0, "reward": 1.7701612710952759, "reward_std": 0.4256659150123596, "rewards/fixed_code_pass_all_test_reward/mean": 0.7701612710952759, "rewards/fixed_code_pass_all_test_reward/std": 0.4256659150123596, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 266.5, "completions/mean_terminated_length": 266.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.6493266924921601, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.07063317182473838, "learning_rate": 6.610075897049788e-06, "loss": 0.0028, "num_tokens": 27672054.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.6495111603025272, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.05624234303832054, "learning_rate": 6.604017608804227e-06, "loss": 0.0022, "num_tokens": 27676911.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 243.25, "completions/mean_terminated_length": 243.25, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6496956281128943, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.07205875916406512, "learning_rate": 6.597960729138827e-06, "loss": 0.0029, "num_tokens": 27685273.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 361.625, "completions/mean_terminated_length": 361.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6498800959232613, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.05811539897695184, "learning_rate": 6.5919052605658514e-06, "loss": 0.0023, "num_tokens": 27696366.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 228.0, "completions/mean_terminated_length": 228.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6500645637336285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.036644408479332924, "learning_rate": 6.585851205596984e-06, "loss": 0.0015, "num_tokens": 27702862.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 224.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6502490315439956, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.04749533487483859, "learning_rate": 6.579798566743314e-06, "loss": 0.0019, "num_tokens": 27707397.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6504334993543627, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.05527442344464362, "learning_rate": 6.573747346515339e-06, "loss": 0.0022, "num_tokens": 27715802.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 364.5, "completions/mean_terminated_length": 364.5, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.6506179671647297, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.05404942180030048, "learning_rate": 6.567697547422981e-06, "loss": 0.0022, "num_tokens": 27723854.0, "reward": 1.567307710647583, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 524.625, "completions/mean_terminated_length": 524.625, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.6508024349750968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.0359881108161062, "learning_rate": 6.5616491719755615e-06, "loss": 0.0014, "num_tokens": 27738067.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 295.75, "completions/mean_terminated_length": 295.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6509869027854639, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.040399502497166395, "learning_rate": 6.55560222268182e-06, "loss": 0.0016, "num_tokens": 27748953.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 197.375, "completions/mean_terminated_length": 197.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.6511713705958311, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.061246199533343315, "learning_rate": 6.5495567020498985e-06, "loss": 0.0024, "num_tokens": 27754604.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 300.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.6513558384061982, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.035521824611350894, "learning_rate": 6.543512612587349e-06, "loss": 0.0014, "num_tokens": 27761496.0, "reward": 1.5, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 244.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.6515403062165652, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.06446971511468291, "learning_rate": 6.537469956801128e-06, "loss": 0.0026, "num_tokens": 27766373.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 372.25, "completions/mean_terminated_length": 372.25, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.6517247740269323, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.04174559086095542, "learning_rate": 6.5314287371976e-06, "loss": 0.0017, "num_tokens": 27774095.0, "reward": 1.7159091234207153, "reward_std": 0.03214121237397194, "rewards/fixed_code_pass_all_test_reward/mean": 0.7159091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.03214123100042343, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 392.375, "completions/mean_terminated_length": 392.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.6519092418372994, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.04109318205155432, "learning_rate": 6.525388956282532e-06, "loss": 0.0016, "num_tokens": 27784226.0, "reward": 1.8068181276321411, "reward_std": 0.03214118629693985, "rewards/fixed_code_pass_all_test_reward/mean": 0.8068181276321411, "rewards/fixed_code_pass_all_test_reward/std": 0.03214120864868164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 324.875, "completions/mean_terminated_length": 324.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6520937096476664, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.04034232581034303, "learning_rate": 6.519350616561095e-06, "loss": 0.0016, "num_tokens": 27790481.0, "reward": 1.8392857313156128, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 246.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6522781774580336, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.06752640684135258, "learning_rate": 6.5133137205378614e-06, "loss": 0.0027, "num_tokens": 27800033.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.6524626452684007, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.06787939136847854, "learning_rate": 6.5072782707168055e-06, "loss": 0.0027, "num_tokens": 27809050.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.6526471130787678, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.05582423764280975, "learning_rate": 6.501244269601302e-06, "loss": 0.0022, "num_tokens": 27816945.0, "reward": 1.851190447807312, "reward_std": 0.12322601675987244, "rewards/fixed_code_pass_all_test_reward/mean": 0.851190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.12322598695755005, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 585.625, "completions/mean_terminated_length": 585.625, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.6528315808891348, "frac_reward_zero_std": 1.0, "grad_norm": 0.056640625, "kl": 0.028178759035654366, "learning_rate": 6.495211719694122e-06, "loss": 0.0011, "num_tokens": 27836902.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 125.25, "completions/mean_terminated_length": 125.25, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.6530160486995019, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.05635019508190453, "learning_rate": 6.489180623497442e-06, "loss": 0.0023, "num_tokens": 27840632.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 313.75, "completions/mean_terminated_length": 313.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.653200516509869, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.0623541553504765, "learning_rate": 6.483150983512824e-06, "loss": 0.0025, "num_tokens": 27850102.0, "reward": 1.8409090042114258, "reward_std": 0.14097540080547333, "rewards/fixed_code_pass_all_test_reward/mean": 0.8409091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.14097541570663452, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.6533849843202362, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.05253709189128131, "learning_rate": 6.477122802241238e-06, "loss": 0.0021, "num_tokens": 27856384.0, "reward": 1.370192289352417, "reward_std": 0.17915162444114685, "rewards/fixed_code_pass_all_test_reward/mean": 0.370192289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.17915163934230804, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 261.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.6535694521306032, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.05246082798112184, "learning_rate": 6.471096082183039e-06, "loss": 0.0021, "num_tokens": 27864855.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.6537539199409703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.034878367790952325, "learning_rate": 6.465070825837985e-06, "loss": 0.0014, "num_tokens": 27869771.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 329.5, "completions/mean_terminated_length": 329.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.6539383877513374, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.05233011534437537, "learning_rate": 6.459047035705219e-06, "loss": 0.0021, "num_tokens": 27881015.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 454.75, "completions/mean_terminated_length": 454.75, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.6541228555617045, "frac_reward_zero_std": 1.0, "grad_norm": 0.04931640625, "kl": 0.02858925610780716, "learning_rate": 6.453024714283278e-06, "loss": 0.0011, "num_tokens": 27889621.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6543073233720715, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.0787665881216526, "learning_rate": 6.4470038640700936e-06, "loss": 0.0032, "num_tokens": 27894008.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 3547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 205.125, "completions/mean_terminated_length": 205.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6544917911824386, "frac_reward_zero_std": 1.0, "grad_norm": 0.35546875, "kl": 0.05967084248550236, "learning_rate": 6.440984487562984e-06, "loss": 0.0024, "num_tokens": 27898497.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.6546762589928058, "frac_reward_zero_std": 1.0, "grad_norm": 0.373046875, "kl": 0.10649725468829274, "learning_rate": 6.434966587258656e-06, "loss": 0.0043, "num_tokens": 27908750.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.6548607268031729, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.06746480870060623, "learning_rate": 6.428950165653204e-06, "loss": 0.0027, "num_tokens": 27912970.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 314.0, "completions/mean_terminated_length": 314.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6550451946135399, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.04613600950688124, "learning_rate": 6.422935225242112e-06, "loss": 0.0018, "num_tokens": 27919578.0, "reward": 1.0625, "reward_std": 0.02042248472571373, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.02042250521481037, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.655229662423907, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.06515448587015271, "learning_rate": 6.416921768520243e-06, "loss": 0.0026, "num_tokens": 27925123.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.6554141302342741, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.08847881108522415, "learning_rate": 6.410909797981856e-06, "loss": 0.0035, "num_tokens": 27929826.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.6555985980446412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.050857752561569214, "learning_rate": 6.4048993161205785e-06, "loss": 0.002, "num_tokens": 27934679.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 375.25, "completions/mean_terminated_length": 375.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.6557830658550083, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.03138178319204599, "learning_rate": 6.398890325429437e-06, "loss": 0.0013, "num_tokens": 27944809.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6559675336653754, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.05614235857501626, "learning_rate": 6.392882828400825e-06, "loss": 0.0022, "num_tokens": 27953701.0, "reward": 1.5, "reward_std": 0.3125182092189789, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.3125182092189789, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.6561520014757425, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.06578756729140878, "learning_rate": 6.386876827526522e-06, "loss": 0.0026, "num_tokens": 27961527.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 456.0, "completions/mean_terminated_length": 456.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6563364692861096, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.05165751092135906, "learning_rate": 6.380872325297692e-06, "loss": 0.0021, "num_tokens": 27975743.0, "reward": 1.9409722089767456, "reward_std": 0.16695578396320343, "rewards/fixed_code_pass_all_test_reward/mean": 0.9409722089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.16695575416088104, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.6565209370964766, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.08702937653288245, "learning_rate": 6.374869324204868e-06, "loss": 0.0035, "num_tokens": 27984669.0, "reward": 1.5, "reward_std": 0.33785802125930786, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.03244692087173462, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 556.75, "completions/mean_terminated_length": 556.75, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.6567054049068437, "frac_reward_zero_std": 1.0, "grad_norm": 0.271484375, "kl": 0.05579887889325619, "learning_rate": 6.3688678267379725e-06, "loss": 0.0022, "num_tokens": 27997595.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.6568898727172109, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.06588817061856389, "learning_rate": 6.362867835386287e-06, "loss": 0.0026, "num_tokens": 28005275.0, "reward": 1.9427083730697632, "reward_std": 0.16204527020454407, "rewards/fixed_code_pass_all_test_reward/mean": 0.9427083730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.16204530000686646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 449.625, "completions/mean_terminated_length": 449.625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.657074340527578, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.03670031763613224, "learning_rate": 6.356869352638487e-06, "loss": 0.0015, "num_tokens": 28020520.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 367.875, "completions/mean_terminated_length": 367.875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.657258808337945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0361328125, "kl": 0.02018467744346708, "learning_rate": 6.350872380982606e-06, "loss": 0.0008, "num_tokens": 28028607.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 406.125, "completions/mean_terminated_length": 406.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6574432761483121, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.03384855599142611, "learning_rate": 6.344876922906064e-06, "loss": 0.0014, "num_tokens": 28036232.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 419.125, "completions/mean_terminated_length": 419.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.6576277439586792, "frac_reward_zero_std": 1.0, "grad_norm": 0.25390625, "kl": 0.05883052013814449, "learning_rate": 6.338882980895651e-06, "loss": 0.0024, "num_tokens": 28045233.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 239.125, "completions/mean_terminated_length": 239.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.6578122117690463, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.0335363355698064, "learning_rate": 6.332890557437516e-06, "loss": 0.0013, "num_tokens": 28051546.0, "reward": 1.9892241954803467, "reward_std": 0.030478745698928833, "rewards/fixed_code_pass_all_test_reward/mean": 0.9892241358757019, "rewards/fixed_code_pass_all_test_reward/std": 0.030478745698928833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 237.0, "completions/mean_terminated_length": 237.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.6579966795794134, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.14821938378736377, "learning_rate": 6.326899655017193e-06, "loss": 0.0059, "num_tokens": 28062354.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 387.0, "completions/mean_terminated_length": 387.0, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.6581811473897805, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.058919940143823624, "learning_rate": 6.320910276119576e-06, "loss": 0.0024, "num_tokens": 28073218.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6583656152001476, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.043378704925999045, "learning_rate": 6.3149224232289285e-06, "loss": 0.0017, "num_tokens": 28080801.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 185.5, "completions/mean_terminated_length": 185.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.6585500830105147, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.027890507830306888, "learning_rate": 6.308936098828888e-06, "loss": 0.0011, "num_tokens": 28085005.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 330.375, "completions/mean_terminated_length": 330.375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.6587345508208817, "frac_reward_zero_std": 1.0, "grad_norm": 0.498046875, "kl": 0.03858129505533725, "learning_rate": 6.302951305402447e-06, "loss": 0.0015, "num_tokens": 28095328.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 141.875, "completions/mean_terminated_length": 141.875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.6589190186312488, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.05368033179547638, "learning_rate": 6.2969680454319724e-06, "loss": 0.0021, "num_tokens": 28099223.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 395.0, "completions/mean_terminated_length": 395.0, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.659103486441616, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.031397609622217715, "learning_rate": 6.290986321399188e-06, "loss": 0.0013, "num_tokens": 28107615.0, "reward": 1.5892856121063232, "reward_std": 0.1937432438135147, "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.19374322891235352, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 298.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.659287954251983, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.08035092707723379, "learning_rate": 6.285006135785188e-06, "loss": 0.0032, "num_tokens": 28116145.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 199.125, "completions/mean_terminated_length": 199.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.6594724220623501, "frac_reward_zero_std": 0.0, "grad_norm": 3.359375, "kl": 0.07521246559917927, "learning_rate": 6.27902749107042e-06, "loss": 0.003, "num_tokens": 28123082.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 238.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6596568898727172, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.05624475609511137, "learning_rate": 6.273050389734701e-06, "loss": 0.0022, "num_tokens": 28128164.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 223.625, "completions/mean_terminated_length": 223.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.6598413576830843, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.06282939692027867, "learning_rate": 6.267074834257199e-06, "loss": 0.0025, "num_tokens": 28137617.0, "reward": 1.5985915660858154, "reward_std": 0.03610537573695183, "rewards/fixed_code_pass_all_test_reward/mean": 0.5985915660858154, "rewards/fixed_code_pass_all_test_reward/std": 0.036105360835790634, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6600258254934513, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.07256766175851226, "learning_rate": 6.261100827116449e-06, "loss": 0.0029, "num_tokens": 28145338.0, "reward": 1.8454545736312866, "reward_std": 0.2861625850200653, "rewards/fixed_code_pass_all_test_reward/mean": 0.8454545736312866, "rewards/fixed_code_pass_all_test_reward/std": 0.2861625850200653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 232.25, "completions/mean_terminated_length": 232.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.6602102933038185, "frac_reward_zero_std": 0.0, "grad_norm": 3.609375, "kl": 0.10055622039362788, "learning_rate": 6.2551283707903375e-06, "loss": 0.004, "num_tokens": 28154036.0, "reward": 1.6749999523162842, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 392.0, "completions/mean_terminated_length": 392.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6603947611141856, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.05927482293918729, "learning_rate": 6.249157467756111e-06, "loss": 0.0024, "num_tokens": 28165572.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 326.625, "completions/mean_terminated_length": 326.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.6605792289245527, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.077272217720747, "learning_rate": 6.2431881204903735e-06, "loss": 0.0031, "num_tokens": 28171001.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 709.375, "completions/mean_terminated_length": 709.375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.6607636967349197, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.037959101668093354, "learning_rate": 6.237220331469078e-06, "loss": 0.0015, "num_tokens": 28189276.0, "reward": 1.5535714626312256, "reward_std": 0.26929885149002075, "rewards/fixed_code_pass_all_test_reward/mean": 0.5535714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.26929885149002075, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 502.0, "completions/mean_terminated_length": 502.0, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.6609481645452868, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.031173185328952968, "learning_rate": 6.231254103167536e-06, "loss": 0.0012, "num_tokens": 28200860.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.6611326323556539, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.060378691647201777, "learning_rate": 6.225289438060412e-06, "loss": 0.0024, "num_tokens": 28211257.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6613171001660211, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0637946268543601, "learning_rate": 6.2193263386217175e-06, "loss": 0.0026, "num_tokens": 28217332.0, "reward": 1.453125, "reward_std": 0.3402828276157379, "rewards/fixed_code_pass_all_test_reward/mean": 0.453125, "rewards/fixed_code_pass_all_test_reward/std": 0.3402828574180603, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 247.625, "completions/mean_terminated_length": 247.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.6615015679763881, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.05433705938048661, "learning_rate": 6.213364807324817e-06, "loss": 0.0022, "num_tokens": 28222993.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 284.75, "completions/mean_terminated_length": 284.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.6616860357867552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0439453125, "kl": 0.019800420734100044, "learning_rate": 6.207404846642429e-06, "loss": 0.0008, "num_tokens": 28228783.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.6618705035971223, "frac_reward_zero_std": 1.0, "grad_norm": 0.24609375, "kl": 0.09030718728899956, "learning_rate": 6.201446459046614e-06, "loss": 0.0036, "num_tokens": 28237476.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 328.875, "completions/mean_terminated_length": 328.875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6620549714074894, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.02824643743224442, "learning_rate": 6.19548964700878e-06, "loss": 0.0011, "num_tokens": 28242859.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 289.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6622394392178564, "frac_reward_zero_std": 1.0, "grad_norm": 0.55859375, "kl": 0.09657092636916786, "learning_rate": 6.189534412999688e-06, "loss": 0.0039, "num_tokens": 28253047.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 358.875, "completions/mean_terminated_length": 358.875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.6624239070282236, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.03226871544029564, "learning_rate": 6.183580759489437e-06, "loss": 0.0013, "num_tokens": 28260974.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6626083748385907, "frac_reward_zero_std": 1.0, "grad_norm": 0.51953125, "kl": 0.11698120273649693, "learning_rate": 6.177628688947479e-06, "loss": 0.0047, "num_tokens": 28270136.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 244.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.6627928426489578, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.04745996627025306, "learning_rate": 6.171678203842597e-06, "loss": 0.0019, "num_tokens": 28275357.0, "reward": 1.8249999284744263, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.8250000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 305.0, "completions/mean_terminated_length": 305.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6629773104593248, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.044338847626931965, "learning_rate": 6.165729306642931e-06, "loss": 0.0018, "num_tokens": 28282077.0, "reward": 1.98369562625885, "reward_std": 0.046115659177303314, "rewards/fixed_code_pass_all_test_reward/mean": 0.9836956262588501, "rewards/fixed_code_pass_all_test_reward/std": 0.04611567035317421, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 372.0, "completions/mean_terminated_length": 372.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6631617782696919, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.04194019688293338, "learning_rate": 6.1597819998159506e-06, "loss": 0.0017, "num_tokens": 28290197.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 278.625, "completions/mean_terminated_length": 278.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.663346246080059, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.04583764914423227, "learning_rate": 6.1538362858284716e-06, "loss": 0.0018, "num_tokens": 28295402.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 287.875, "completions/mean_terminated_length": 287.875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.6635307138904262, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.05538696609437466, "learning_rate": 6.147892167146648e-06, "loss": 0.0022, "num_tokens": 28305353.0, "reward": 1.7083333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.11785111576318741, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 297.875, "completions/mean_terminated_length": 297.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6637151817007932, "frac_reward_zero_std": 1.0, "grad_norm": 0.2041015625, "kl": 0.026862247264944017, "learning_rate": 6.141949646235972e-06, "loss": 0.0011, "num_tokens": 28312688.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6638996495111603, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.06219944078475237, "learning_rate": 6.136008725561273e-06, "loss": 0.0025, "num_tokens": 28319133.0, "reward": 1.431318759918213, "reward_std": 0.19157744944095612, "rewards/fixed_code_pass_all_test_reward/mean": 0.4313187301158905, "rewards/fixed_code_pass_all_test_reward/std": 0.19157744944095612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 361.125, "completions/mean_terminated_length": 361.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6640841173215274, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.06612764555029571, "learning_rate": 6.130069407586714e-06, "loss": 0.0026, "num_tokens": 28331814.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 309.875, "completions/mean_terminated_length": 309.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6642685851318945, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.06591458315961063, "learning_rate": 6.124131694775801e-06, "loss": 0.0026, "num_tokens": 28340853.0, "reward": 1.274999976158142, "reward_std": 0.0235702283680439, "rewards/fixed_code_pass_all_test_reward/mean": 0.27500003576278687, "rewards/fixed_code_pass_all_test_reward/std": 0.0235702246427536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 202.0, "completions/mean_terminated_length": 202.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.6644530529422615, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.04079571028705686, "learning_rate": 6.118195589591362e-06, "loss": 0.0016, "num_tokens": 28346397.0, "reward": 1.5641447305679321, "reward_std": 0.14523793756961823, "rewards/fixed_code_pass_all_test_reward/mean": 0.5641447305679321, "rewards/fixed_code_pass_all_test_reward/std": 0.14523793756961823, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 216.875, "completions/mean_terminated_length": 216.875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6646375207526287, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.06592871993780136, "learning_rate": 6.11226109449557e-06, "loss": 0.0026, "num_tokens": 28355876.0, "reward": 1.472426414489746, "reward_std": 0.21317189931869507, "rewards/fixed_code_pass_all_test_reward/mean": 0.47242647409439087, "rewards/fixed_code_pass_all_test_reward/std": 0.21317189931869507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 584.0, "completions/mean_terminated_length": 584.0, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.6648219885629958, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.019576584396418184, "learning_rate": 6.106328211949928e-06, "loss": 0.0008, "num_tokens": 28372972.0, "reward": 1.3125, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.6650064563733629, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.054448296665214, "learning_rate": 6.100396944415261e-06, "loss": 0.0022, "num_tokens": 28376935.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6651909241837299, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.037257086602039635, "learning_rate": 6.0944672943517355e-06, "loss": 0.0015, "num_tokens": 28381431.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 358.0, "completions/mean_terminated_length": 358.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.665375391994097, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.06335990922525525, "learning_rate": 6.0885392642188415e-06, "loss": 0.0025, "num_tokens": 28390487.0, "reward": 1.75, "reward_std": 0.32732683420181274, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.32732683420181274, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 467.625, "completions/mean_terminated_length": 467.625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.6655598598044641, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.020919817965477705, "learning_rate": 6.082612856475397e-06, "loss": 0.0008, "num_tokens": 28398516.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.6657443276148313, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.06584582105278969, "learning_rate": 6.076688073579547e-06, "loss": 0.0026, "num_tokens": 28403013.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6659287954251983, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.05186172411777079, "learning_rate": 6.070764917988767e-06, "loss": 0.0021, "num_tokens": 28408170.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 266.25, "completions/mean_terminated_length": 266.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.6661132632355654, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.05480237607844174, "learning_rate": 6.064843392159852e-06, "loss": 0.0022, "num_tokens": 28415716.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.6662977310459325, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.0686554741114378, "learning_rate": 6.058923498548919e-06, "loss": 0.0027, "num_tokens": 28424126.0, "reward": 1.9956896305084229, "reward_std": 0.01219149399548769, "rewards/fixed_code_pass_all_test_reward/mean": 0.9956896305084229, "rewards/fixed_code_pass_all_test_reward/std": 0.012191502377390862, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 248.75, "completions/mean_terminated_length": 248.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.6664821988562996, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.07793403137475252, "learning_rate": 6.053005239611418e-06, "loss": 0.0031, "num_tokens": 28428980.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 276.125, "completions/mean_terminated_length": 276.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.07185703376308084, "learning_rate": 6.047088617802113e-06, "loss": 0.0029, "num_tokens": 28434189.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 243.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6668511344770337, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.08029084093868732, "learning_rate": 6.0411736355750905e-06, "loss": 0.0032, "num_tokens": 28439146.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6670356022874009, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.043232835829257965, "learning_rate": 6.035260295383756e-06, "loss": 0.0017, "num_tokens": 28444482.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 433.125, "completions/mean_terminated_length": 433.125, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.667220070097768, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.04560354514978826, "learning_rate": 6.02934859968084e-06, "loss": 0.0018, "num_tokens": 28453203.0, "reward": 1.899999976158142, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.667404537908135, "frac_reward_zero_std": 1.0, "grad_norm": 0.484375, "kl": 0.08094275649636984, "learning_rate": 6.023438550918379e-06, "loss": 0.0032, "num_tokens": 28458390.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.6675890057185021, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.03306874609552324, "learning_rate": 6.017530151547742e-06, "loss": 0.0013, "num_tokens": 28463303.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 504.375, "completions/mean_terminated_length": 504.375, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.6677734735288692, "frac_reward_zero_std": 1.0, "grad_norm": 0.2216796875, "kl": 0.03904478088952601, "learning_rate": 6.011623404019601e-06, "loss": 0.0016, "num_tokens": 28472386.0, "reward": 1.100000023841858, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.10000000149011612, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 338.875, "completions/mean_terminated_length": 338.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6679579413392362, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.07097699400037527, "learning_rate": 6.005718310783946e-06, "loss": 0.0028, "num_tokens": 28479585.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 615.125, "completions/mean_terminated_length": 615.125, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.6681424091496034, "frac_reward_zero_std": 0.0, "grad_norm": 0.734375, "kl": 0.0342242824845016, "learning_rate": 5.999814874290084e-06, "loss": 0.0014, "num_tokens": 28491354.0, "reward": 1.6458332538604736, "reward_std": 0.7261416912078857, "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.4266657531261444, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 340.75, "completions/mean_terminated_length": 340.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.6683268769599705, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.032899942016229033, "learning_rate": 5.993913096986632e-06, "loss": 0.0013, "num_tokens": 28497904.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 377.375, "completions/mean_terminated_length": 377.375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.6685113447703376, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.06439146166667342, "learning_rate": 5.9880129813215215e-06, "loss": 0.0026, "num_tokens": 28504491.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.6686958125807047, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.08246763818897307, "learning_rate": 5.982114529741991e-06, "loss": 0.0033, "num_tokens": 28512298.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.6688802803910717, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.054147657472640276, "learning_rate": 5.976217744694594e-06, "loss": 0.0022, "num_tokens": 28516499.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 242.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.6690647482014388, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.06728681246750057, "learning_rate": 5.970322628625185e-06, "loss": 0.0027, "num_tokens": 28524502.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.669249216011806, "frac_reward_zero_std": 1.0, "grad_norm": 0.04296875, "kl": 0.04910304304212332, "learning_rate": 5.964429183978935e-06, "loss": 0.002, "num_tokens": 28530569.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 422.0, "completions/mean_terminated_length": 422.0, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.669433683822173, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.04672361444681883, "learning_rate": 5.958537413200315e-06, "loss": 0.0019, "num_tokens": 28540961.0, "reward": 1.8181817531585693, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8181818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 500.75, "completions/mean_terminated_length": 500.75, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.6696181516325401, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.029290123202372342, "learning_rate": 5.9526473187331065e-06, "loss": 0.0012, "num_tokens": 28550423.0, "reward": 1.1500000953674316, "reward_std": 0.09258202463388443, "rewards/fixed_code_pass_all_test_reward/mean": 0.15000000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 362.5, "completions/mean_terminated_length": 362.5, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.6698026194429072, "frac_reward_zero_std": 1.0, "grad_norm": 0.059814453125, "kl": 0.035521154874004424, "learning_rate": 5.946758903020393e-06, "loss": 0.0014, "num_tokens": 28562571.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 162.875, "completions/mean_terminated_length": 162.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6699870872532743, "frac_reward_zero_std": 1.0, "grad_norm": 0.337890625, "kl": 0.08464078651741147, "learning_rate": 5.940872168504559e-06, "loss": 0.0034, "num_tokens": 28566610.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 241.875, "completions/mean_terminated_length": 241.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.6701715550636413, "frac_reward_zero_std": 1.0, "grad_norm": 0.1767578125, "kl": 0.07394789811223745, "learning_rate": 5.934987117627299e-06, "loss": 0.003, "num_tokens": 28571473.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 278.875, "completions/mean_terminated_length": 278.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.6703560228740085, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.06680607795715332, "learning_rate": 5.929103752829602e-06, "loss": 0.0027, "num_tokens": 28580280.0, "reward": 1.7666666507720947, "reward_std": 0.09428088366985321, "rewards/fixed_code_pass_all_test_reward/mean": 0.7666666507720947, "rewards/fixed_code_pass_all_test_reward/std": 0.0942808985710144, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6705404906843756, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.04847661918029189, "learning_rate": 5.923222076551763e-06, "loss": 0.0019, "num_tokens": 28585594.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 349.375, "completions/mean_terminated_length": 349.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6707249584947427, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.05466028000228107, "learning_rate": 5.9173420912333715e-06, "loss": 0.0022, "num_tokens": 28593245.0, "reward": 1.0729166269302368, "reward_std": 0.20623943209648132, "rewards/fixed_code_pass_all_test_reward/mean": 0.0729166641831398, "rewards/fixed_code_pass_all_test_reward/std": 0.2062394767999649, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6709094263051097, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.0432259445078671, "learning_rate": 5.911463799313323e-06, "loss": 0.0017, "num_tokens": 28598214.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.6710938941154768, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.043215092620812356, "learning_rate": 5.905587203229801e-06, "loss": 0.0017, "num_tokens": 28604094.0, "reward": 1.6666667461395264, "reward_std": 0.4714045226573944, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.4714045524597168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 360.75, "completions/mean_terminated_length": 360.75, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.6712783619258439, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.07415771996602416, "learning_rate": 5.899712305420296e-06, "loss": 0.003, "num_tokens": 28614076.0, "reward": 1.708984375, "reward_std": 0.01657281443476677, "rewards/fixed_code_pass_all_test_reward/mean": 0.708984375, "rewards/fixed_code_pass_all_test_reward/std": 0.01657281443476677, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 311.125, "completions/mean_terminated_length": 311.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6714628297362111, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.07051830971613526, "learning_rate": 5.893839108321584e-06, "loss": 0.0028, "num_tokens": 28622165.0, "reward": 1.234375, "reward_std": 0.6527329087257385, "rewards/fixed_code_pass_all_test_reward/mean": 0.359375, "rewards/fixed_code_pass_all_test_reward/std": 0.4454006254673004, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6716472975465781, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.051126609556376934, "learning_rate": 5.887967614369741e-06, "loss": 0.002, "num_tokens": 28628246.0, "reward": 1.4134615659713745, "reward_std": 0.16023807227611542, "rewards/fixed_code_pass_all_test_reward/mean": 0.4134615361690521, "rewards/fixed_code_pass_all_test_reward/std": 0.1602380871772766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 374.5, "completions/mean_terminated_length": 374.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6718317653569452, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.05822719680145383, "learning_rate": 5.882097826000137e-06, "loss": 0.0023, "num_tokens": 28636770.0, "reward": 0.8125, "reward_std": 0.5044445395469666, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6720162331673123, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.06167027959600091, "learning_rate": 5.876229745647432e-06, "loss": 0.0025, "num_tokens": 28643607.0, "reward": 1.5499999523162842, "reward_std": 0.2777460515499115, "rewards/fixed_code_pass_all_test_reward/mean": 0.550000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2777460217475891, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 255.375, "completions/mean_terminated_length": 255.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6722007009776794, "frac_reward_zero_std": 1.0, "grad_norm": 0.05908203125, "kl": 0.02440280350856483, "learning_rate": 5.87036337574558e-06, "loss": 0.001, "num_tokens": 28648786.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.6723851687880464, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.08689741510897875, "learning_rate": 5.864498718727829e-06, "loss": 0.0035, "num_tokens": 28656071.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 360.75, "completions/mean_terminated_length": 360.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.6725696365984136, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.04919885494746268, "learning_rate": 5.858635777026706e-06, "loss": 0.002, "num_tokens": 28662525.0, "reward": 1.375, "reward_std": 0.3918818533420563, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.39188191294670105, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 615.75, "completions/mean_terminated_length": 615.75, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.6727541044087807, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224609375, "kl": 0.032840518397279084, "learning_rate": 5.852774553074035e-06, "loss": 0.0013, "num_tokens": 28674291.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.6729385722191478, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.054625645745545626, "learning_rate": 5.846915049300924e-06, "loss": 0.0022, "num_tokens": 28679371.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 301.0, "completions/mean_terminated_length": 301.0, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.6731230400295148, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.050615180749446154, "learning_rate": 5.841057268137771e-06, "loss": 0.002, "num_tokens": 28685187.0, "reward": 1.8888888359069824, "reward_std": 0.20573778450489044, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 284.875, "completions/mean_terminated_length": 284.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.6733075078398819, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.05928889336064458, "learning_rate": 5.835201212014254e-06, "loss": 0.0024, "num_tokens": 28694962.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.673491975650249, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.05035428795963526, "learning_rate": 5.829346883359341e-06, "loss": 0.002, "num_tokens": 28704502.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.6736764434606162, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.08633128507062793, "learning_rate": 5.8234942846012835e-06, "loss": 0.0035, "num_tokens": 28708858.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 281.125, "completions/mean_terminated_length": 281.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.6738609112709832, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.054373232182115316, "learning_rate": 5.817643418167606e-06, "loss": 0.0022, "num_tokens": 28720155.0, "reward": 1.5, "reward_std": 0.022580957040190697, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.022580984979867935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6740453790813503, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.047444838332012296, "learning_rate": 5.811794286485124e-06, "loss": 0.0019, "num_tokens": 28726248.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.6742298468917174, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.05803643399849534, "learning_rate": 5.8059468919799355e-06, "loss": 0.0023, "num_tokens": 28730840.0, "reward": 1.975000023841858, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6744143147020845, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.056263803504407406, "learning_rate": 5.800101237077411e-06, "loss": 0.0023, "num_tokens": 28740881.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.6745987825124515, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.04804453323595226, "learning_rate": 5.794257324202201e-06, "loss": 0.0019, "num_tokens": 28746539.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1065.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 482.125, "completions/mean_terminated_length": 482.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6747832503228187, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.07247955864295363, "learning_rate": 5.7884151557782305e-06, "loss": 0.0029, "num_tokens": 28755564.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 3658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 160.875, "completions/mean_terminated_length": 160.875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.6749677181331858, "frac_reward_zero_std": 1.0, "grad_norm": 1.4296875, "kl": 0.11998280859552324, "learning_rate": 5.782574734228713e-06, "loss": 0.0048, "num_tokens": 28759779.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6751521859435529, "frac_reward_zero_std": 0.0, "grad_norm": 3.515625, "kl": 0.27731818705797195, "learning_rate": 5.7767360619761245e-06, "loss": 0.0111, "num_tokens": 28769190.0, "reward": 1.4798387289047241, "reward_std": 0.4770447611808777, "rewards/fixed_code_pass_all_test_reward/mean": 0.6048387289047241, "rewards/fixed_code_pass_all_test_reward/std": 0.40021923184394836, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6753366537539199, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.03648492624051869, "learning_rate": 5.7708991414422235e-06, "loss": 0.0015, "num_tokens": 28774341.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 236.5, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.675521121564287, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.06386289233341813, "learning_rate": 5.765063975048034e-06, "loss": 0.0026, "num_tokens": 28779217.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.6757055893746541, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.05165883037261665, "learning_rate": 5.759230565213856e-06, "loss": 0.0021, "num_tokens": 28786939.0, "reward": 1.8840909004211426, "reward_std": 0.07203412055969238, "rewards/fixed_code_pass_all_test_reward/mean": 0.8840909004211426, "rewards/fixed_code_pass_all_test_reward/std": 0.07203403860330582, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 512.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6758900571850213, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.04128218628466129, "learning_rate": 5.753398914359267e-06, "loss": 0.0017, "num_tokens": 28801987.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 235.375, "completions/mean_terminated_length": 235.375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.6760745249953883, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.10680602863430977, "learning_rate": 5.74756902490311e-06, "loss": 0.0043, "num_tokens": 28824678.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 402.25, "completions/mean_terminated_length": 402.25, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.6762589928057554, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.07412908971309662, "learning_rate": 5.741740899263495e-06, "loss": 0.003, "num_tokens": 28831856.0, "reward": 1.6666667461395264, "reward_std": 0.7126966714859009, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 296.0, "completions/mean_terminated_length": 296.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.6764434606161225, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.06862268527038395, "learning_rate": 5.735914539857799e-06, "loss": 0.0027, "num_tokens": 28842464.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 736.0, "completions/mean_terminated_length": 736.0, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.6766279284264896, "frac_reward_zero_std": 0.0, "grad_norm": 0.7734375, "kl": 0.041288231033831835, "learning_rate": 5.730089949102676e-06, "loss": 0.0017, "num_tokens": 28859232.0, "reward": 1.1607142686843872, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 247.0, "completions/mean_terminated_length": 247.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.6768123962368566, "frac_reward_zero_std": 1.0, "grad_norm": 0.057373046875, "kl": 0.028180686116684228, "learning_rate": 5.724267129414039e-06, "loss": 0.0011, "num_tokens": 28866896.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 301.125, "completions/mean_terminated_length": 301.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6769968640472238, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.06084687262773514, "learning_rate": 5.718446083207068e-06, "loss": 0.0024, "num_tokens": 28877961.0, "reward": 1.2333333492279053, "reward_std": 0.2760262191295624, "rewards/fixed_code_pass_all_test_reward/mean": 0.23333334922790527, "rewards/fixed_code_pass_all_test_reward/std": 0.2760262191295624, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.6771813318575909, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.07275045453570783, "learning_rate": 5.712626812896203e-06, "loss": 0.0029, "num_tokens": 28883814.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 303.375, "completions/mean_terminated_length": 303.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.677365799667958, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.04179192893207073, "learning_rate": 5.706809320895157e-06, "loss": 0.0017, "num_tokens": 28889081.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 296.375, "completions/mean_terminated_length": 296.375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.677550267478325, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.06221320200711489, "learning_rate": 5.7009936096168985e-06, "loss": 0.0025, "num_tokens": 28897516.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 345.5, "completions/mean_terminated_length": 345.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6777347352886921, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.0680410722270608, "learning_rate": 5.6951796814736596e-06, "loss": 0.0027, "num_tokens": 28907256.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 401.375, "completions/mean_terminated_length": 401.375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.6779192030990592, "frac_reward_zero_std": 1.0, "grad_norm": 0.053466796875, "kl": 0.04143703880254179, "learning_rate": 5.6893675388769305e-06, "loss": 0.0017, "num_tokens": 28915339.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 531.5, "completions/mean_terminated_length": 531.5, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.6781036709094264, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.04322336078621447, "learning_rate": 5.68355718423746e-06, "loss": 0.0017, "num_tokens": 28924447.0, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 248.875, "completions/mean_terminated_length": 248.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.6782881387197934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.04622936458326876, "learning_rate": 5.6777486199652645e-06, "loss": 0.0018, "num_tokens": 28933270.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 187.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.6784726065301605, "frac_reward_zero_std": 1.0, "grad_norm": 0.6484375, "kl": 0.10957567766308784, "learning_rate": 5.671941848469606e-06, "loss": 0.0044, "num_tokens": 28937646.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 411.5, "completions/mean_terminated_length": 411.5, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.6786570743405276, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.0324166010832414, "learning_rate": 5.6661368721590116e-06, "loss": 0.0013, "num_tokens": 28948290.0, "reward": 1.138157844543457, "reward_std": 0.055824220180511475, "rewards/fixed_code_pass_all_test_reward/mean": 0.1381578892469406, "rewards/fixed_code_pass_all_test_reward/std": 0.055824216455221176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6788415421508947, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.07841106643900275, "learning_rate": 5.660333693441253e-06, "loss": 0.0031, "num_tokens": 28955293.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 230.75, "completions/mean_terminated_length": 230.75, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6790260099612617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.04000744456425309, "learning_rate": 5.654532314723371e-06, "loss": 0.0016, "num_tokens": 28960667.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 415.25, "completions/mean_terminated_length": 415.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6792104777716288, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.05734253628179431, "learning_rate": 5.648732738411651e-06, "loss": 0.0023, "num_tokens": 28971245.0, "reward": 1.2287232875823975, "reward_std": 0.5531622171401978, "rewards/fixed_code_pass_all_test_reward/mean": 0.353723406791687, "rewards/fixed_code_pass_all_test_reward/std": 0.2827094793319702, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.679394945581996, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.03147875558352098, "learning_rate": 5.642934966911627e-06, "loss": 0.0013, "num_tokens": 28976149.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 358.125, "completions/mean_terminated_length": 358.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.679579413392363, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.048423262778669596, "learning_rate": 5.637139002628095e-06, "loss": 0.0019, "num_tokens": 28986222.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 223.75, "completions/mean_terminated_length": 223.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6797638812027301, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.04327237163670361, "learning_rate": 5.6313448479650944e-06, "loss": 0.0017, "num_tokens": 28995628.0, "reward": 1.5686619281768799, "reward_std": 0.08809857815504074, "rewards/fixed_code_pass_all_test_reward/mean": 0.5686619281768799, "rewards/fixed_code_pass_all_test_reward/std": 0.08809857070446014, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 552.375, "completions/mean_terminated_length": 552.375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.6799483490130972, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.01599673309829086, "learning_rate": 5.625552505325911e-06, "loss": 0.0006, "num_tokens": 29009119.0, "reward": 1.9043209552764893, "reward_std": 0.2706211507320404, "rewards/fixed_code_pass_all_test_reward/mean": 0.9043209552764893, "rewards/fixed_code_pass_all_test_reward/std": 0.270621120929718, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 301.0, "completions/mean_terminated_length": 301.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6801328168234643, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.16279565321747214, "learning_rate": 5.619761977113092e-06, "loss": 0.0065, "num_tokens": 29015935.0, "reward": 1.8478261232376099, "reward_std": 0.3509179353713989, "rewards/fixed_code_pass_all_test_reward/mean": 0.9728261232376099, "rewards/fixed_code_pass_all_test_reward/std": 0.07685943692922592, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.6803172846338313, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.04793447197880596, "learning_rate": 5.613973265728416e-06, "loss": 0.0019, "num_tokens": 29020131.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 218.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6805017524441985, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.0428868243470788, "learning_rate": 5.608186373572922e-06, "loss": 0.0017, "num_tokens": 29024752.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6806862202545656, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.05971455294638872, "learning_rate": 5.602401303046876e-06, "loss": 0.0024, "num_tokens": 29035505.0, "reward": 1.2621359825134277, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.26213592290878296, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6808706880649327, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "kl": 0.08328622160479426, "learning_rate": 5.596618056549815e-06, "loss": 0.0033, "num_tokens": 29041108.0, "reward": 1.8958332538604736, "reward_std": 0.294627845287323, "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 284.5, "completions/mean_terminated_length": 284.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.6810551558752997, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.06715378211811185, "learning_rate": 5.5908366364804975e-06, "loss": 0.0027, "num_tokens": 29051232.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.6812396236856668, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.057141649071127176, "learning_rate": 5.585057045236932e-06, "loss": 0.0023, "num_tokens": 29057731.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 295.375, "completions/mean_terminated_length": 295.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.6814240914960339, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.03981076891068369, "learning_rate": 5.579279285216369e-06, "loss": 0.0016, "num_tokens": 29064206.0, "reward": 1.814655065536499, "reward_std": 0.00798118021339178, "rewards/fixed_code_pass_all_test_reward/mean": 0.8146551847457886, "rewards/fixed_code_pass_all_test_reward/std": 0.007981226779520512, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 369.0, "completions/mean_terminated_length": 369.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6816085593064011, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.05680990917608142, "learning_rate": 5.573503358815294e-06, "loss": 0.0023, "num_tokens": 29077230.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 347.125, "completions/mean_terminated_length": 347.125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.6817930271167681, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.05372508568689227, "learning_rate": 5.567729268429445e-06, "loss": 0.0021, "num_tokens": 29088703.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.6819774949271352, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.05247478582896292, "learning_rate": 5.561957016453785e-06, "loss": 0.0021, "num_tokens": 29094452.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.6821619627375023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.049421800998970866, "learning_rate": 5.556186605282521e-06, "loss": 0.002, "num_tokens": 29098686.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 177.625, "completions/mean_terminated_length": 177.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.6823464305478694, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.0704203862696886, "learning_rate": 5.550418037309089e-06, "loss": 0.0028, "num_tokens": 29106619.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 245.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.6825308983582364, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.07070517959073186, "learning_rate": 5.544651314926176e-06, "loss": 0.0028, "num_tokens": 29117109.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.6827153661686036, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.06761193252168596, "learning_rate": 5.538886440525689e-06, "loss": 0.0027, "num_tokens": 29122499.0, "reward": 1.8300000429153442, "reward_std": 0.28425338864326477, "rewards/fixed_code_pass_all_test_reward/mean": 0.8300000429153442, "rewards/fixed_code_pass_all_test_reward/std": 0.28425338864326477, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 276.875, "completions/mean_terminated_length": 276.875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.6828998339789707, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.04865336720831692, "learning_rate": 5.533123416498774e-06, "loss": 0.0019, "num_tokens": 29133994.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 315.25, "completions/mean_terminated_length": 315.25, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6830843017893378, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.04105960892047733, "learning_rate": 5.5273622452358055e-06, "loss": 0.0016, "num_tokens": 29139316.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 546.5, "completions/mean_terminated_length": 546.5, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.6832687695997048, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.026640623109415174, "learning_rate": 5.5216029291264e-06, "loss": 0.0011, "num_tokens": 29155640.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.6834532374100719, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.02422093745553866, "learning_rate": 5.515845470559393e-06, "loss": 0.001, "num_tokens": 29161739.0, "reward": 1.7916666269302368, "reward_std": 0.3857583999633789, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.3857583701610565, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 243.25, "completions/mean_terminated_length": 243.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.683637705220439, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.04771150159649551, "learning_rate": 5.5100898719228544e-06, "loss": 0.0019, "num_tokens": 29169229.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 358.75, "completions/mean_terminated_length": 358.75, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.6838221730308062, "frac_reward_zero_std": 1.0, "grad_norm": 0.244140625, "kl": 0.07577046100050211, "learning_rate": 5.504336135604084e-06, "loss": 0.003, "num_tokens": 29178755.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.6840066408411732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.05911656154785305, "learning_rate": 5.4985842639896e-06, "loss": 0.0024, "num_tokens": 29188537.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 276.875, "completions/mean_terminated_length": 276.875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.6841911086515403, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.052113022189587355, "learning_rate": 5.492834259465165e-06, "loss": 0.0021, "num_tokens": 29194816.0, "reward": 1.5852272510528564, "reward_std": 0.6646929383277893, "rewards/fixed_code_pass_all_test_reward/mean": 0.7102272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.3374827802181244, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 492.125, "completions/mean_terminated_length": 492.125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.6843755764619074, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.04601561976596713, "learning_rate": 5.487086124415752e-06, "loss": 0.0018, "num_tokens": 29203905.0, "reward": 1.2589285373687744, "reward_std": 0.07576141506433487, "rewards/fixed_code_pass_all_test_reward/mean": 0.2589285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.07576145231723785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.6845600442722745, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.08568951953202486, "learning_rate": 5.481339861225565e-06, "loss": 0.0034, "num_tokens": 29212172.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 213.125, "completions/mean_terminated_length": 213.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.6847445120826415, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.06819872977212071, "learning_rate": 5.4755954722780236e-06, "loss": 0.0027, "num_tokens": 29219733.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 385.75, "completions/mean_terminated_length": 385.75, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6849289798930087, "frac_reward_zero_std": 1.0, "grad_norm": 0.4140625, "kl": 0.08384104445576668, "learning_rate": 5.4698529599557836e-06, "loss": 0.0034, "num_tokens": 29227499.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.6851134477033758, "frac_reward_zero_std": 1.0, "grad_norm": 0.345703125, "kl": 0.07650069566443563, "learning_rate": 5.4641123266407135e-06, "loss": 0.0031, "num_tokens": 29238878.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 300.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.6852979155137429, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.07223155302926898, "learning_rate": 5.4583735747139024e-06, "loss": 0.0029, "num_tokens": 29248293.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 159.375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.6854823833241099, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.07371826632879674, "learning_rate": 5.452636706555662e-06, "loss": 0.0029, "num_tokens": 29254744.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 458.375, "completions/mean_terminated_length": 458.375, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.685666851134477, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.028940978343598545, "learning_rate": 5.446901724545516e-06, "loss": 0.0012, "num_tokens": 29264307.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 746.75, "completions/mean_terminated_length": 746.75, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 0.6858513189448441, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.036706829676404595, "learning_rate": 5.4411686310622196e-06, "loss": 0.0015, "num_tokens": 29281393.0, "reward": 1.314814805984497, "reward_std": 0.32288143038749695, "rewards/fixed_code_pass_all_test_reward/mean": 0.31481480598449707, "rewards/fixed_code_pass_all_test_reward/std": 0.32288140058517456, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 399.0, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.6860357867552113, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.04220913257449865, "learning_rate": 5.435437428483733e-06, "loss": 0.0017, "num_tokens": 29289521.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.6862202545655783, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.05426789750345051, "learning_rate": 5.4297081191872334e-06, "loss": 0.0022, "num_tokens": 29295131.0, "reward": 1.5, "reward_std": 0.4140393137931824, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.41403937339782715, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.6864047223759454, "frac_reward_zero_std": 1.0, "grad_norm": 0.1123046875, "kl": 0.052670068107545376, "learning_rate": 5.4239807055491135e-06, "loss": 0.0021, "num_tokens": 29304804.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 206.0, "completions/mean_terminated_length": 206.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6865891901863125, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.06710272375494242, "learning_rate": 5.418255189944985e-06, "loss": 0.0027, "num_tokens": 29312508.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 225.375, "completions/mean_terminated_length": 225.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.6867736579966796, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.07392398873344064, "learning_rate": 5.412531574749665e-06, "loss": 0.003, "num_tokens": 29321631.0, "reward": 1.7763158082962036, "reward_std": 0.0074432059191167355, "rewards/fixed_code_pass_all_test_reward/mean": 0.7763158082962036, "rewards/fixed_code_pass_all_test_reward/std": 0.0074432180263102055, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6869581258070466, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.06531312875449657, "learning_rate": 5.406809862337188e-06, "loss": 0.0026, "num_tokens": 29328893.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6871425936174138, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.036794677027501166, "learning_rate": 5.401090055080797e-06, "loss": 0.0015, "num_tokens": 29335227.0, "reward": 1.5923912525177002, "reward_std": 0.4984109401702881, "rewards/fixed_code_pass_all_test_reward/mean": 0.592391312122345, "rewards/fixed_code_pass_all_test_reward/std": 0.4984109401702881, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 516.0, "completions/mean_terminated_length": 516.0, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.6873270614277809, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.028193362639285624, "learning_rate": 5.3953721553529425e-06, "loss": 0.0011, "num_tokens": 29347915.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 266.75, "completions/mean_terminated_length": 266.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.687511529238148, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.054948396515101194, "learning_rate": 5.389656165525281e-06, "loss": 0.0022, "num_tokens": 29356305.0, "reward": 1.5202702283859253, "reward_std": 0.14856088161468506, "rewards/fixed_code_pass_all_test_reward/mean": 0.5202702283859253, "rewards/fixed_code_pass_all_test_reward/std": 0.14856086671352386, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 326.875, "completions/mean_terminated_length": 326.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.687695997048515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.03456807276234031, "learning_rate": 5.383942087968693e-06, "loss": 0.0014, "num_tokens": 29362168.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 407.5, "completions/mean_terminated_length": 407.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.6878804648588821, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.06868772464804351, "learning_rate": 5.3782299250532475e-06, "loss": 0.0027, "num_tokens": 29371700.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 278.125, "completions/mean_terminated_length": 278.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6880649326692492, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.07078847102820873, "learning_rate": 5.372519679148227e-06, "loss": 0.0028, "num_tokens": 29380437.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 181.125, "completions/mean_terminated_length": 181.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.6882494004796164, "frac_reward_zero_std": 0.0, "grad_norm": 3.515625, "kl": 0.04879022645764053, "learning_rate": 5.366811352622114e-06, "loss": 0.002, "num_tokens": 29387854.0, "reward": 1.921875, "reward_std": 0.11995559185743332, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.1199556216597557, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.6884338682899834, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.04299047729000449, "learning_rate": 5.361104947842609e-06, "loss": 0.0017, "num_tokens": 29393967.0, "reward": 1.5688775777816772, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.6938775777816772, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 567.5, "completions/mean_terminated_length": 567.5, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.6886183361003505, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.037408398813568056, "learning_rate": 5.355400467176599e-06, "loss": 0.0015, "num_tokens": 29407891.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.6888028039107176, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.05943103181198239, "learning_rate": 5.349697912990181e-06, "loss": 0.0024, "num_tokens": 29416026.0, "reward": 1.6375000476837158, "reward_std": 0.0824958086013794, "rewards/fixed_code_pass_all_test_reward/mean": 0.6375000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.0824957937002182, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 195.625, "completions/mean_terminated_length": 195.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6889872717210846, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.06428057886660099, "learning_rate": 5.343997287648647e-06, "loss": 0.0026, "num_tokens": 29422807.0, "reward": 1.2808642387390137, "reward_std": 0.6413220763206482, "rewards/fixed_code_pass_all_test_reward/mean": 0.4058641791343689, "rewards/fixed_code_pass_all_test_reward/std": 0.4127139747142792, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.6891717395314517, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.042539784335531294, "learning_rate": 5.3382985935165e-06, "loss": 0.0017, "num_tokens": 29426708.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 294.0, "completions/mean_terminated_length": 294.0, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6893562073418189, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.024263361294288188, "learning_rate": 5.332601832957434e-06, "loss": 0.001, "num_tokens": 29438564.0, "reward": 1.5609502792358398, "reward_std": 0.018446864560246468, "rewards/fixed_code_pass_all_test_reward/mean": 0.5609503984451294, "rewards/fixed_code_pass_all_test_reward/std": 0.018446870148181915, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.689540675152186, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.04766351403668523, "learning_rate": 5.326907008334339e-06, "loss": 0.0019, "num_tokens": 29444079.0, "reward": 1.0714285373687744, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 236.375, "completions/mean_terminated_length": 236.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.689725142962553, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.10774441296234727, "learning_rate": 5.321214122009306e-06, "loss": 0.0043, "num_tokens": 29452226.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 235.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6899096107729201, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.046697352547198534, "learning_rate": 5.31552317634362e-06, "loss": 0.0019, "num_tokens": 29457139.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6900940785832872, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.05536888213828206, "learning_rate": 5.309834173697768e-06, "loss": 0.0022, "num_tokens": 29467078.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 338.375, "completions/mean_terminated_length": 338.375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6902785463936543, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.02770042442716658, "learning_rate": 5.304147116431422e-06, "loss": 0.0011, "num_tokens": 29474625.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.6904630142040215, "frac_reward_zero_std": 0.0, "grad_norm": 3.421875, "kl": 0.15039126202464104, "learning_rate": 5.298462006903451e-06, "loss": 0.006, "num_tokens": 29479481.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 174.125, "completions/mean_terminated_length": 174.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.6906474820143885, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.13166656531393528, "learning_rate": 5.292778847471911e-06, "loss": 0.0053, "num_tokens": 29485546.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 240.0, "completions/mean_terminated_length": 240.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.6908319498247556, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.052873983047902584, "learning_rate": 5.287097640494061e-06, "loss": 0.0021, "num_tokens": 29491770.0, "reward": 1.850000023841858, "reward_std": 0.2777460217475891, "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2777460217475891, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 181.75, "completions/mean_terminated_length": 181.75, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.6910164176351227, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.0782367642968893, "learning_rate": 5.28141838832634e-06, "loss": 0.0031, "num_tokens": 29496232.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 332.625, "completions/mean_terminated_length": 332.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.6912008854454897, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.03656643151771277, "learning_rate": 5.275741093324381e-06, "loss": 0.0015, "num_tokens": 29503293.0, "reward": 1.4722223281860352, "reward_std": 0.18332402408123016, "rewards/fixed_code_pass_all_test_reward/mean": 0.4722222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.18332397937774658, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.6913853532558568, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.11156094563193619, "learning_rate": 5.270065757843e-06, "loss": 0.0045, "num_tokens": 29511672.0, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.6915698210662239, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.04610870219767094, "learning_rate": 5.264392384236201e-06, "loss": 0.0018, "num_tokens": 29518783.0, "reward": 1.234375, "reward_std": 0.30935922265052795, "rewards/fixed_code_pass_all_test_reward/mean": 0.234375, "rewards/fixed_code_pass_all_test_reward/std": 0.30935922265052795, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.6917542888765911, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.033242873614653945, "learning_rate": 5.258720974857186e-06, "loss": 0.0013, "num_tokens": 29525166.0, "reward": 1.7340425252914429, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7340425252914429, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 463.375, "completions/mean_terminated_length": 463.375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.6919387566869581, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.030349210457643494, "learning_rate": 5.253051532058329e-06, "loss": 0.0012, "num_tokens": 29538913.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 621.875, "completions/mean_terminated_length": 621.875, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.6921232244973252, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.0293849689187482, "learning_rate": 5.247384058191189e-06, "loss": 0.0012, "num_tokens": 29554496.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.6923076923076923, "frac_reward_zero_std": 1.0, "grad_norm": 0.265625, "kl": 0.05948563851416111, "learning_rate": 5.241718555606512e-06, "loss": 0.0024, "num_tokens": 29560918.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.6924921601180594, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.09347476530820131, "learning_rate": 5.236055026654232e-06, "loss": 0.0037, "num_tokens": 29571322.0, "reward": 1.8624999523162842, "reward_std": 0.16201849281787872, "rewards/fixed_code_pass_all_test_reward/mean": 0.862500011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.16201850771903992, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6926766279284264, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.062359289964661, "learning_rate": 5.230393473683455e-06, "loss": 0.0025, "num_tokens": 29577569.0, "reward": 1.71875, "reward_std": 0.405046284198761, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.2893187701702118, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 260.875, "completions/mean_terminated_length": 260.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6928610957387936, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.055326757952570915, "learning_rate": 5.2247338990424715e-06, "loss": 0.0022, "num_tokens": 29583904.0, "reward": 1.2321429252624512, "reward_std": 0.5862242579460144, "rewards/fixed_code_pass_all_test_reward/mean": 0.3571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.3967800438404083, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 317.0, "completions/mean_terminated_length": 317.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6930455635491607, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.09187700040638447, "learning_rate": 5.21907630507875e-06, "loss": 0.0037, "num_tokens": 29596944.0, "reward": 1.9431817531585693, "reward_std": 0.16070610284805298, "rewards/fixed_code_pass_all_test_reward/mean": 0.9431818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.16070608794689178, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.6932300313595278, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.14633216802030802, "learning_rate": 5.213420694138934e-06, "loss": 0.0059, "num_tokens": 29600712.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 617.25, "completions/mean_terminated_length": 617.25, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.6934144991698948, "frac_reward_zero_std": 0.0, "grad_norm": 0.7890625, "kl": 0.031754052615724504, "learning_rate": 5.207767068568859e-06, "loss": 0.0013, "num_tokens": 29612714.0, "reward": 1.7132352590560913, "reward_std": 0.13129833340644836, "rewards/fixed_code_pass_all_test_reward/mean": 0.7132352590560913, "rewards/fixed_code_pass_all_test_reward/std": 0.13129831850528717, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 107.125, "completions/mean_terminated_length": 107.125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.6935989669802619, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.045623639365658164, "learning_rate": 5.202115430713519e-06, "loss": 0.0018, "num_tokens": 29616499.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 205.75, "completions/mean_terminated_length": 205.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.693783434790629, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.0337809135671705, "learning_rate": 5.196465782917095e-06, "loss": 0.0014, "num_tokens": 29624273.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 345.5, "completions/mean_terminated_length": 345.5, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6939679026009962, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.028950452688150108, "learning_rate": 5.190818127522932e-06, "loss": 0.0012, "num_tokens": 29631685.0, "reward": 1.2916667461395264, "reward_std": 0.1178511381149292, "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511381149292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 386.5, "completions/mean_terminated_length": 386.5, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.6941523704113632, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.045047470135614276, "learning_rate": 5.185172466873563e-06, "loss": 0.0018, "num_tokens": 29639001.0, "reward": 1.7321428060531616, "reward_std": 0.36967799067497253, "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428060531616, "rewards/fixed_code_pass_all_test_reward/std": 0.36967799067497253, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.6943368382217303, "frac_reward_zero_std": 0.0, "grad_norm": 3.90625, "kl": 0.06510244705714285, "learning_rate": 5.179528803310686e-06, "loss": 0.0026, "num_tokens": 29647916.0, "reward": 1.3382353782653809, "reward_std": 0.13176806271076202, "rewards/fixed_code_pass_all_test_reward/mean": 0.3382352888584137, "rewards/fixed_code_pass_all_test_reward/std": 0.1317680925130844, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.6945213060320974, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.05810054042376578, "learning_rate": 5.1738871391751696e-06, "loss": 0.0023, "num_tokens": 29654761.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 581.25, "completions/mean_terminated_length": 581.25, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.6947057738424645, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.03720660298131406, "learning_rate": 5.168247476807054e-06, "loss": 0.0015, "num_tokens": 29670323.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 283.875, "completions/mean_terminated_length": 283.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6948902416528315, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.03811023477464914, "learning_rate": 5.162609818545545e-06, "loss": 0.0015, "num_tokens": 29677074.0, "reward": 1.8571429252624512, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 256.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.6950747094631987, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.055335769546218216, "learning_rate": 5.156974166729028e-06, "loss": 0.0022, "num_tokens": 29682788.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 292.625, "completions/mean_terminated_length": 292.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.6952591772735658, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.059920014813542366, "learning_rate": 5.151340523695049e-06, "loss": 0.0024, "num_tokens": 29693065.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 149.375, "completions/mean_terminated_length": 149.375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.6954436450839329, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.06982100894674659, "learning_rate": 5.145708891780319e-06, "loss": 0.0028, "num_tokens": 29697012.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.6956281128942999, "frac_reward_zero_std": 1.0, "grad_norm": 0.154296875, "kl": 0.06550692580640316, "learning_rate": 5.140079273320716e-06, "loss": 0.0026, "num_tokens": 29704861.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 223.75, "completions/mean_terminated_length": 223.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.695812580704667, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.06347026210278273, "learning_rate": 5.134451670651284e-06, "loss": 0.0025, "num_tokens": 29713371.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 232.75, "completions/mean_terminated_length": 232.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.6959970485150341, "frac_reward_zero_std": 1.0, "grad_norm": 0.6484375, "kl": 0.11070303479209542, "learning_rate": 5.128826086106236e-06, "loss": 0.0044, "num_tokens": 29722041.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.6961815163254013, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.05848913500085473, "learning_rate": 5.1232025220189405e-06, "loss": 0.0023, "num_tokens": 29731144.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.6963659841357683, "frac_reward_zero_std": 0.0, "grad_norm": 3.734375, "kl": 0.13553006760776043, "learning_rate": 5.11758098072193e-06, "loss": 0.0054, "num_tokens": 29735378.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.6965504519461354, "frac_reward_zero_std": 1.0, "grad_norm": 0.2021484375, "kl": 0.067637640517205, "learning_rate": 5.111961464546895e-06, "loss": 0.0027, "num_tokens": 29740749.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 538.25, "completions/mean_terminated_length": 538.25, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.6967349197565025, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.03371901670470834, "learning_rate": 5.1063439758246955e-06, "loss": 0.0013, "num_tokens": 29755767.0, "reward": 1.6022727489471436, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 259.75, "completions/mean_terminated_length": 259.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6969193875668696, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.08619596855714917, "learning_rate": 5.100728516885342e-06, "loss": 0.0034, "num_tokens": 29763669.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6971038553772366, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.06468289438635111, "learning_rate": 5.095115090058007e-06, "loss": 0.0026, "num_tokens": 29771224.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 578.625, "completions/mean_terminated_length": 578.625, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.6972883231876038, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.06839769287034869, "learning_rate": 5.0895036976710164e-06, "loss": 0.0027, "num_tokens": 29781029.0, "reward": 1.8571428060531616, "reward_std": 0.27532121539115906, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.2753211557865143, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 136.875, "completions/mean_terminated_length": 136.875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6974727909979709, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.04741918505169451, "learning_rate": 5.083894342051852e-06, "loss": 0.0019, "num_tokens": 29784868.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 320.75, "completions/mean_terminated_length": 320.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.697657258808338, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.056176312384195626, "learning_rate": 5.078287025527161e-06, "loss": 0.0022, "num_tokens": 29791754.0, "reward": 1.5568182468414307, "reward_std": 0.22498852014541626, "rewards/fixed_code_pass_all_test_reward/mean": 0.5568181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.22498852014541626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 222.0, "completions/mean_terminated_length": 222.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.697841726618705, "frac_reward_zero_std": 1.0, "grad_norm": 0.2734375, "kl": 0.07487614452838898, "learning_rate": 5.072681750422732e-06, "loss": 0.003, "num_tokens": 29801986.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.6980261944290721, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.07849258184432983, "learning_rate": 5.067078519063514e-06, "loss": 0.0031, "num_tokens": 29809232.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 260.875, "completions/mean_terminated_length": 260.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6982106622394392, "frac_reward_zero_std": 1.0, "grad_norm": 0.041259765625, "kl": 0.04374441667459905, "learning_rate": 5.0614773337736015e-06, "loss": 0.0017, "num_tokens": 29815911.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 227.625, "completions/mean_terminated_length": 227.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.6983951300498064, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.10232554189860821, "learning_rate": 5.0558781968762525e-06, "loss": 0.0041, "num_tokens": 29820652.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 293.375, "completions/mean_terminated_length": 293.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6985795978601734, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.07022060872986913, "learning_rate": 5.050281110693866e-06, "loss": 0.0028, "num_tokens": 29831903.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.6987640656705405, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.0633476059883833, "learning_rate": 5.04468607754799e-06, "loss": 0.0025, "num_tokens": 29840916.0, "reward": 1.345588207244873, "reward_std": 0.283745676279068, "rewards/fixed_code_pass_all_test_reward/mean": 0.3455882668495178, "rewards/fixed_code_pass_all_test_reward/std": 0.283745676279068, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1157.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 382.375, "completions/mean_terminated_length": 382.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6989485334809076, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.0634287754073739, "learning_rate": 5.039093099759325e-06, "loss": 0.0025, "num_tokens": 29851719.0, "reward": 0.9000000357627869, "reward_std": 0.5855400562286377, "rewards/fixed_code_pass_all_test_reward/mean": 0.15000000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.20701968669891357, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 195.0, "completions/mean_terminated_length": 195.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.6991330012912746, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.0986030288040638, "learning_rate": 5.033502179647713e-06, "loss": 0.0039, "num_tokens": 29860239.0, "reward": 1.1399999856948853, "reward_std": 0.3500204086303711, "rewards/fixed_code_pass_all_test_reward/mean": 0.14000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.3500204086303711, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.6993174691016417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.06545897433534265, "learning_rate": 5.027913319532156e-06, "loss": 0.0026, "num_tokens": 29868767.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 415.5, "completions/mean_terminated_length": 415.5, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.6995019369120089, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.05586144560948014, "learning_rate": 5.022326521730787e-06, "loss": 0.0022, "num_tokens": 29880355.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 345.75, "completions/mean_terminated_length": 345.75, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.699686404722376, "frac_reward_zero_std": 1.0, "grad_norm": 0.045654296875, "kl": 0.03437668434344232, "learning_rate": 5.016741788560889e-06, "loss": 0.0014, "num_tokens": 29888297.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 309.375, "completions/mean_terminated_length": 309.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.699870872532743, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.055603883462026715, "learning_rate": 5.011159122338887e-06, "loss": 0.0022, "num_tokens": 29894468.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.7000553403431101, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.05095011508092284, "learning_rate": 5.005578525380355e-06, "loss": 0.002, "num_tokens": 29901395.0, "reward": 1.9144736528396606, "reward_std": 0.24190497398376465, "rewards/fixed_code_pass_all_test_reward/mean": 0.9144736528396606, "rewards/fixed_code_pass_all_test_reward/std": 0.24190495908260345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 329.125, "completions/mean_terminated_length": 329.125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.7002398081534772, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.03615976503351703, "learning_rate": 5.000000000000003e-06, "loss": 0.0014, "num_tokens": 29910500.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.7004242759638443, "frac_reward_zero_std": 1.0, "grad_norm": 0.1220703125, "kl": 0.06909260433167219, "learning_rate": 4.994423548511681e-06, "loss": 0.0028, "num_tokens": 29917846.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 280.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.7006087437742115, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.07205517683178186, "learning_rate": 4.988849173228377e-06, "loss": 0.0029, "num_tokens": 29927295.0, "reward": 1.7307692766189575, "reward_std": 0.3916930854320526, "rewards/fixed_code_pass_all_test_reward/mean": 0.7307692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.391693115234375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.7007932115845785, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.057606622111052275, "learning_rate": 4.983276876462231e-06, "loss": 0.0023, "num_tokens": 29932761.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 300.125, "completions/mean_terminated_length": 300.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7009776793949456, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.0589928631670773, "learning_rate": 4.977706660524506e-06, "loss": 0.0024, "num_tokens": 29941410.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 154.25, "completions/mean_terminated_length": 154.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7011621472053127, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.07135063922032714, "learning_rate": 4.972138527725607e-06, "loss": 0.0029, "num_tokens": 29949540.0, "reward": 1.8794642686843872, "reward_std": 0.34092649817466736, "rewards/fixed_code_pass_all_test_reward/mean": 0.8794642686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.34092649817466736, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.7013466150156797, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.04719691805075854, "learning_rate": 4.966572480375076e-06, "loss": 0.0019, "num_tokens": 29954523.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 715.625, "completions/mean_terminated_length": 715.625, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.7015310828260468, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.026991162914782763, "learning_rate": 4.961008520781585e-06, "loss": 0.0011, "num_tokens": 29967584.0, "reward": 1.1875, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 542.125, "completions/mean_terminated_length": 542.125, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.701715550636414, "frac_reward_zero_std": 1.0, "grad_norm": 0.23828125, "kl": 0.04407649394124746, "learning_rate": 4.9554466512529486e-06, "loss": 0.0018, "num_tokens": 29978505.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 210.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.7019000184467811, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.07208435330539942, "learning_rate": 4.949886874096114e-06, "loss": 0.0029, "num_tokens": 29984372.0, "reward": 1.5416666269302368, "reward_std": 0.40079084038734436, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.36176151037216187, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.7020844862571481, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.06490298011340201, "learning_rate": 4.944329191617152e-06, "loss": 0.0026, "num_tokens": 29988144.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 297.0, "completions/mean_terminated_length": 297.0, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.7022689540675152, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.04161426890641451, "learning_rate": 4.938773606121271e-06, "loss": 0.0017, "num_tokens": 29997264.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 235.875, "completions/mean_terminated_length": 235.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7024534218778823, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.04504646954592317, "learning_rate": 4.933220119912802e-06, "loss": 0.0018, "num_tokens": 30002207.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7026378896882494, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.06024576909840107, "learning_rate": 4.92766873529522e-06, "loss": 0.0024, "num_tokens": 30010387.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 264.125, "completions/mean_terminated_length": 264.125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.7028223574986165, "frac_reward_zero_std": 1.0, "grad_norm": 0.41796875, "kl": 0.10475434269756079, "learning_rate": 4.922119454571116e-06, "loss": 0.0042, "num_tokens": 30018596.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.7030068253089836, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.04600129171740264, "learning_rate": 4.91657228004221e-06, "loss": 0.0018, "num_tokens": 30024286.0, "reward": 1.47826087474823, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.47826087474823, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 154.375, "completions/mean_terminated_length": 154.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.7031912931193507, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.0862234125379473, "learning_rate": 4.911027214009352e-06, "loss": 0.0034, "num_tokens": 30028265.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.7033757609297178, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.05400510667823255, "learning_rate": 4.905484258772511e-06, "loss": 0.0022, "num_tokens": 30036981.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.7035602287400848, "frac_reward_zero_std": 1.0, "grad_norm": 0.201171875, "kl": 0.05119851580820978, "learning_rate": 4.899943416630795e-06, "loss": 0.002, "num_tokens": 30040805.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 408.625, "completions/mean_terminated_length": 408.625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.7037446965504519, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.05168400565162301, "learning_rate": 4.894404689882417e-06, "loss": 0.0021, "num_tokens": 30049538.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.703929164360819, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.06208220776170492, "learning_rate": 4.888868080824726e-06, "loss": 0.0025, "num_tokens": 30057809.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 398.5, "completions/mean_terminated_length": 398.5, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.7041136321711862, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.05932723404839635, "learning_rate": 4.883333591754181e-06, "loss": 0.0024, "num_tokens": 30065725.0, "reward": 1.1617646217346191, "reward_std": 0.0816899761557579, "rewards/fixed_code_pass_all_test_reward/mean": 0.1617647111415863, "rewards/fixed_code_pass_all_test_reward/std": 0.08169001340866089, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 146.25, "completions/mean_terminated_length": 146.25, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.7042980999815532, "frac_reward_zero_std": 1.0, "grad_norm": 0.18359375, "kl": 0.060865233186632395, "learning_rate": 4.8778012249663785e-06, "loss": 0.0024, "num_tokens": 30069671.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 359.75, "completions/mean_terminated_length": 359.75, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.7044825677919203, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.05004762695170939, "learning_rate": 4.87227098275602e-06, "loss": 0.002, "num_tokens": 30077221.0, "reward": 1.9615384340286255, "reward_std": 0.07121694087982178, "rewards/fixed_code_pass_all_test_reward/mean": 0.9615384340286255, "rewards/fixed_code_pass_all_test_reward/std": 0.07121692597866058, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 308.125, "completions/mean_terminated_length": 308.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7046670356022874, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.07509279565420002, "learning_rate": 4.86674286741693e-06, "loss": 0.003, "num_tokens": 30087534.0, "reward": 1.3068181276321411, "reward_std": 0.04704994708299637, "rewards/fixed_code_pass_all_test_reward/mean": 0.3068181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.04704992473125458, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 225.25, "completions/mean_terminated_length": 225.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.7048515034126545, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.05984076298773289, "learning_rate": 4.861216881242051e-06, "loss": 0.0024, "num_tokens": 30094832.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7050359712230215, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.0855354133527726, "learning_rate": 4.855693026523442e-06, "loss": 0.0034, "num_tokens": 30102389.0, "reward": 1.7027027606964111, "reward_std": 0.36375492811203003, "rewards/fixed_code_pass_all_test_reward/mean": 0.7027026414871216, "rewards/fixed_code_pass_all_test_reward/std": 0.36375492811203003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.7052204390333887, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.061318673426285386, "learning_rate": 4.850171305552282e-06, "loss": 0.0025, "num_tokens": 30109574.0, "reward": 1.6363636255264282, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 163.375, "completions/mean_terminated_length": 163.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.7054049068437558, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.03405877610202879, "learning_rate": 4.844651720618859e-06, "loss": 0.0014, "num_tokens": 30113833.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.7055893746541229, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.07373270811513066, "learning_rate": 4.8391342740125806e-06, "loss": 0.0029, "num_tokens": 30121662.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 175.625, "completions/mean_terminated_length": 175.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.7057738424644899, "frac_reward_zero_std": 1.0, "grad_norm": 0.177734375, "kl": 0.07060714345425367, "learning_rate": 4.833618968021957e-06, "loss": 0.0028, "num_tokens": 30128867.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 237.0, "completions/mean_terminated_length": 237.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.705958310274857, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.05259980587288737, "learning_rate": 4.828105804934629e-06, "loss": 0.0021, "num_tokens": 30138507.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 336.375, "completions/mean_terminated_length": 336.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.7061427780852241, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.05982100265100598, "learning_rate": 4.8225947870373305e-06, "loss": 0.0024, "num_tokens": 30145174.0, "reward": 1.9900000095367432, "reward_std": 0.018516384065151215, "rewards/fixed_code_pass_all_test_reward/mean": 0.9900000095367432, "rewards/fixed_code_pass_all_test_reward/std": 0.018516412004828453, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7063272458955913, "frac_reward_zero_std": 1.0, "grad_norm": 0.060546875, "kl": 0.033244481310248375, "learning_rate": 4.817085916615914e-06, "loss": 0.0013, "num_tokens": 30152202.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 200.625, "completions/mean_terminated_length": 200.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.7065117137059583, "frac_reward_zero_std": 0.0, "grad_norm": 3.421875, "kl": 0.06785973580554128, "learning_rate": 4.811579195955337e-06, "loss": 0.0027, "num_tokens": 30159783.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.7066961815163254, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.03047541220439598, "learning_rate": 4.8060746273396755e-06, "loss": 0.0012, "num_tokens": 30168135.0, "reward": 1.2083333730697632, "reward_std": 0.08384613692760468, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.08384616672992706, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 309.5, "completions/mean_terminated_length": 309.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.7068806493266925, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.04607432149350643, "learning_rate": 4.800572213052101e-06, "loss": 0.0019, "num_tokens": 30177355.0, "reward": 1.033653974533081, "reward_std": 0.013598186895251274, "rewards/fixed_code_pass_all_test_reward/mean": 0.03365384787321091, "rewards/fixed_code_pass_all_test_reward/std": 0.01359820831567049, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.7070651171370596, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.05989663559012115, "learning_rate": 4.795071955374897e-06, "loss": 0.0024, "num_tokens": 30182803.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 342.5, "completions/mean_terminated_length": 342.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.7072495849474266, "frac_reward_zero_std": 1.0, "grad_norm": 0.322265625, "kl": 0.07401546568144113, "learning_rate": 4.789573856589453e-06, "loss": 0.003, "num_tokens": 30194503.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 355.75, "completions/mean_terminated_length": 355.75, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.7074340527577938, "frac_reward_zero_std": 1.0, "grad_norm": 0.0380859375, "kl": 0.021791439736261964, "learning_rate": 4.784077918976254e-06, "loss": 0.0009, "num_tokens": 30202349.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 110.625, "completions/mean_terminated_length": 110.625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.7076185205681609, "frac_reward_zero_std": 1.0, "grad_norm": 0.248046875, "kl": 0.04544575000181794, "learning_rate": 4.778584144814907e-06, "loss": 0.0018, "num_tokens": 30206106.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.707802988378528, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.05921050673350692, "learning_rate": 4.7730925363841075e-06, "loss": 0.0024, "num_tokens": 30214047.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 254.5, "completions/mean_terminated_length": 254.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.707987456188895, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.04872401594184339, "learning_rate": 4.767603095961652e-06, "loss": 0.0019, "num_tokens": 30220235.0, "reward": 1.8478260040283203, "reward_std": 0.2827281057834625, "rewards/fixed_code_pass_all_test_reward/mean": 0.8478260636329651, "rewards/fixed_code_pass_all_test_reward/std": 0.2827281355857849, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.7081719239992621, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.08397746505215764, "learning_rate": 4.762115825824444e-06, "loss": 0.0034, "num_tokens": 30226749.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 293.375, "completions/mean_terminated_length": 293.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7083563918096292, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.030361522920429707, "learning_rate": 4.756630728248488e-06, "loss": 0.0012, "num_tokens": 30233312.0, "reward": 1.6774193048477173, "reward_std": 0.22701022028923035, "rewards/fixed_code_pass_all_test_reward/mean": 0.6774193048477173, "rewards/fixed_code_pass_all_test_reward/std": 0.22701019048690796, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.7085408596199964, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.0806163758970797, "learning_rate": 4.751147805508881e-06, "loss": 0.0032, "num_tokens": 30240491.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 439.25, "completions/mean_terminated_length": 439.25, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.7087253274303634, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.03740382706746459, "learning_rate": 4.745667059879822e-06, "loss": 0.0015, "num_tokens": 30251341.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 260.25, "completions/mean_terminated_length": 260.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.7089097952407305, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.06140184123069048, "learning_rate": 4.7401884936346e-06, "loss": 0.0025, "num_tokens": 30260815.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.7090942630510976, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.04486780695151538, "learning_rate": 4.7347121090456165e-06, "loss": 0.0018, "num_tokens": 30268643.0, "reward": 1.8392856121063232, "reward_std": 0.3198815882205963, "rewards/fixed_code_pass_all_test_reward/mean": 0.8392857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.3198816478252411, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.7092787308614646, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.03009818948339671, "learning_rate": 4.729237908384347e-06, "loss": 0.0012, "num_tokens": 30275089.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 483.125, "completions/mean_terminated_length": 483.125, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.7094631986718317, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.03452628676313907, "learning_rate": 4.723765893921382e-06, "loss": 0.0014, "num_tokens": 30285090.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 290.5, "completions/mean_terminated_length": 290.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.7096476664821989, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.048722162027843297, "learning_rate": 4.71829606792639e-06, "loss": 0.0019, "num_tokens": 30290454.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 542.625, "completions/mean_terminated_length": 542.625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.709832134292566, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.05910768685862422, "learning_rate": 4.712828432668137e-06, "loss": 0.0024, "num_tokens": 30303779.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.710016602102933, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.06292694457806647, "learning_rate": 4.707362990414476e-06, "loss": 0.0025, "num_tokens": 30310736.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.7102010699133001, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.039967420510947704, "learning_rate": 4.701899743432361e-06, "loss": 0.0016, "num_tokens": 30317233.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.7103855377236672, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.03829885902814567, "learning_rate": 4.696438693987826e-06, "loss": 0.0015, "num_tokens": 30322889.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 157.875, "completions/mean_terminated_length": 157.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7105700055340343, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.09790154546499252, "learning_rate": 4.690979844345997e-06, "loss": 0.0039, "num_tokens": 30329784.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 260.25, "completions/mean_terminated_length": 260.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7107544733444014, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.02936880965717137, "learning_rate": 4.685523196771086e-06, "loss": 0.0012, "num_tokens": 30334586.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 232.875, "completions/mean_terminated_length": 232.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.7109389411547685, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.07717095990665257, "learning_rate": 4.6800687535263886e-06, "loss": 0.0031, "num_tokens": 30339361.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7111234089651356, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.05082589271478355, "learning_rate": 4.674616516874299e-06, "loss": 0.002, "num_tokens": 30347712.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 200.375, "completions/mean_terminated_length": 200.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.7113078767755027, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.06389410188421607, "learning_rate": 4.669166489076283e-06, "loss": 0.0026, "num_tokens": 30353963.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 357.0, "completions/mean_terminated_length": 357.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7114923445858697, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.042827249970287085, "learning_rate": 4.663718672392894e-06, "loss": 0.0017, "num_tokens": 30361267.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 164.375, "completions/mean_terminated_length": 164.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7116768123962368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.029614152386784554, "learning_rate": 4.658273069083764e-06, "loss": 0.0012, "num_tokens": 30365526.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 399.5, "completions/mean_terminated_length": 399.5, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.711861280206604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.03307322628097609, "learning_rate": 4.652829681407621e-06, "loss": 0.0013, "num_tokens": 30373738.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.7120457480169711, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.04743895772844553, "learning_rate": 4.64738851162226e-06, "loss": 0.0019, "num_tokens": 30379133.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 198.375, "completions/mean_terminated_length": 198.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7122302158273381, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.0435379846021533, "learning_rate": 4.6419495619845615e-06, "loss": 0.0017, "num_tokens": 30383640.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7124146836377052, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.05272874073125422, "learning_rate": 4.63651283475048e-06, "loss": 0.0021, "num_tokens": 30389258.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7125991514480723, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.05097152921371162, "learning_rate": 4.631078332175058e-06, "loss": 0.002, "num_tokens": 30395078.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 291.75, "completions/mean_terminated_length": 291.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7127836192584394, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.08158058137632906, "learning_rate": 4.62564605651241e-06, "loss": 0.0033, "num_tokens": 30405820.0, "reward": 1.8055555820465088, "reward_std": 0.28327882289886475, "rewards/fixed_code_pass_all_test_reward/mean": 0.8055555820465088, "rewards/fixed_code_pass_all_test_reward/std": 0.2832788825035095, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.7129680870688065, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.0234702160814777, "learning_rate": 4.620216010015725e-06, "loss": 0.0009, "num_tokens": 30414631.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.7131525548791736, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.06160775059834123, "learning_rate": 4.614788194937268e-06, "loss": 0.0025, "num_tokens": 30422426.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 323.75, "completions/mean_terminated_length": 323.75, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.7133370226895407, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.06860989215783775, "learning_rate": 4.609362613528375e-06, "loss": 0.0027, "num_tokens": 30431176.0, "reward": 1.8489583730697632, "reward_std": 0.3506920039653778, "rewards/fixed_code_pass_all_test_reward/mean": 0.8489583730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.3506919741630554, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7135214904999078, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.06968801887705922, "learning_rate": 4.6039392680394715e-06, "loss": 0.0028, "num_tokens": 30440535.0, "reward": 1.9272727966308594, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9272727370262146, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 218.5, "completions/mean_terminated_length": 218.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.7137059583102748, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.06853314768522978, "learning_rate": 4.598518160720037e-06, "loss": 0.0027, "num_tokens": 30445787.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7138904261206419, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.08003338193520904, "learning_rate": 4.59309929381863e-06, "loss": 0.0032, "num_tokens": 30454530.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 193.875, "completions/mean_terminated_length": 193.875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.7140748939310091, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.043233988573774695, "learning_rate": 4.587682669582877e-06, "loss": 0.0017, "num_tokens": 30459025.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 544.125, "completions/mean_terminated_length": 544.125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.7142593617413762, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.02047198300715536, "learning_rate": 4.5822682902594836e-06, "loss": 0.0008, "num_tokens": 30468282.0, "reward": 1.5892857313156128, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.7142857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 203.25, "completions/mean_terminated_length": 203.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.7144438295517432, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.06921168323606253, "learning_rate": 4.5768561580942135e-06, "loss": 0.0028, "num_tokens": 30473012.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 332.25, "completions/mean_terminated_length": 332.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7146282973621103, "frac_reward_zero_std": 1.0, "grad_norm": 0.3984375, "kl": 0.0761150571051985, "learning_rate": 4.571446275331903e-06, "loss": 0.003, "num_tokens": 30479086.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7148127651724774, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.056625811383128166, "learning_rate": 4.566038644216454e-06, "loss": 0.0023, "num_tokens": 30485019.0, "reward": 1.767045497894287, "reward_std": 0.21790771186351776, "rewards/fixed_code_pass_all_test_reward/mean": 0.7670454382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.21790769696235657, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 237.875, "completions/mean_terminated_length": 237.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.7149972329828445, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.06707597523927689, "learning_rate": 4.5606332669908305e-06, "loss": 0.0027, "num_tokens": 30489890.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 253.875, "completions/mean_terminated_length": 253.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.7151817007932116, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.03699842747300863, "learning_rate": 4.555230145897076e-06, "loss": 0.0015, "num_tokens": 30498113.0, "reward": 1.7833333015441895, "reward_std": 0.18082702159881592, "rewards/fixed_code_pass_all_test_reward/mean": 0.7833333015441895, "rewards/fixed_code_pass_all_test_reward/std": 0.18082700669765472, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 494.5, "completions/mean_terminated_length": 494.5, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.7153661686035787, "frac_reward_zero_std": 1.0, "grad_norm": 0.1923828125, "kl": 0.03201985906343907, "learning_rate": 4.5498292831762825e-06, "loss": 0.0013, "num_tokens": 30512437.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 321.25, "completions/mean_terminated_length": 321.25, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.7155506364139458, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.034875017940066755, "learning_rate": 4.544430681068612e-06, "loss": 0.0014, "num_tokens": 30521279.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 436.25, "completions/mean_terminated_length": 436.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7157351042243129, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.05771413038019091, "learning_rate": 4.5390343418132855e-06, "loss": 0.0023, "num_tokens": 30533561.0, "reward": 1.7018229961395264, "reward_std": 0.3593345582485199, "rewards/fixed_code_pass_all_test_reward/mean": 0.7018229365348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3593345582485199, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 413.625, "completions/mean_terminated_length": 413.625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.7159195720346799, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.04173109796829522, "learning_rate": 4.533640267648593e-06, "loss": 0.0017, "num_tokens": 30542158.0, "reward": 1.8166667222976685, "reward_std": 0.1603567749261856, "rewards/fixed_code_pass_all_test_reward/mean": 0.8166666626930237, "rewards/fixed_code_pass_all_test_reward/std": 0.16035674512386322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.716104039845047, "frac_reward_zero_std": 1.0, "grad_norm": 0.01806640625, "kl": 0.014176143682561815, "learning_rate": 4.5282484608118795e-06, "loss": 0.0006, "num_tokens": 30549593.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 513.5, "completions/mean_terminated_length": 513.5, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.7162885076554141, "frac_reward_zero_std": 1.0, "grad_norm": 0.050048828125, "kl": 0.024835170013830066, "learning_rate": 4.5228589235395436e-06, "loss": 0.001, "num_tokens": 30559805.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7164729754657813, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.08842871198430657, "learning_rate": 4.517471658067058e-06, "loss": 0.0035, "num_tokens": 30567286.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 165.0, "completions/mean_terminated_length": 165.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.7166574432761483, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.08727003121748567, "learning_rate": 4.512086666628934e-06, "loss": 0.0035, "num_tokens": 30571342.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 328.25, "completions/mean_terminated_length": 328.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.7168419110865154, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.04423598735593259, "learning_rate": 4.506703951458761e-06, "loss": 0.0018, "num_tokens": 30579032.0, "reward": 1.5390625, "reward_std": 0.383733868598938, "rewards/fixed_code_pass_all_test_reward/mean": 0.5390625, "rewards/fixed_code_pass_all_test_reward/std": 0.383733868598938, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 188.625, "completions/mean_terminated_length": 188.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.7170263788968825, "frac_reward_zero_std": 1.0, "grad_norm": 0.369140625, "kl": 0.07078360114246607, "learning_rate": 4.501323514789166e-06, "loss": 0.0028, "num_tokens": 30586053.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 287.0, "completions/mean_terminated_length": 287.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.7172108467072495, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.08497605286538601, "learning_rate": 4.495945358851841e-06, "loss": 0.0034, "num_tokens": 30598525.0, "reward": 1.0560345649719238, "reward_std": 0.020085575059056282, "rewards/fixed_code_pass_all_test_reward/mean": 0.05603448674082756, "rewards/fixed_code_pass_all_test_reward/std": 0.020085599273443222, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.7173953145176166, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.08513224008493125, "learning_rate": 4.490569485877527e-06, "loss": 0.0034, "num_tokens": 30607584.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7175797823279838, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.06019554054364562, "learning_rate": 4.485195898096016e-06, "loss": 0.0024, "num_tokens": 30617314.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 228.625, "completions/mean_terminated_length": 228.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7177642501383509, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.058768254704773426, "learning_rate": 4.479824597736166e-06, "loss": 0.0024, "num_tokens": 30623183.0, "reward": 1.5, "reward_std": 0.18516398966312408, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.18516403436660767, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 356.25, "completions/mean_terminated_length": 356.25, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.717948717948718, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.02977571077644825, "learning_rate": 4.47445558702587e-06, "loss": 0.0012, "num_tokens": 30633721.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 198.75, "completions/mean_terminated_length": 198.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.718133185759085, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.036806508200243115, "learning_rate": 4.469088868192078e-06, "loss": 0.0015, "num_tokens": 30639495.0, "reward": 1.2870879173278809, "reward_std": 0.2258531153202057, "rewards/fixed_code_pass_all_test_reward/mean": 0.28708791732788086, "rewards/fixed_code_pass_all_test_reward/std": 0.2258531153202057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 237.25, "completions/mean_terminated_length": 237.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.7183176535694521, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.047020606230944395, "learning_rate": 4.463724443460788e-06, "loss": 0.0019, "num_tokens": 30649529.0, "reward": 1.6107282638549805, "reward_std": 0.157289519906044, "rewards/fixed_code_pass_all_test_reward/mean": 0.6107283234596252, "rewards/fixed_code_pass_all_test_reward/std": 0.1572895050048828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 305.625, "completions/mean_terminated_length": 305.625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.7185021213798192, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.06683598272502422, "learning_rate": 4.458362315057051e-06, "loss": 0.0027, "num_tokens": 30660750.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 244.125, "completions/mean_terminated_length": 244.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.7186865891901864, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.0729732122272253, "learning_rate": 4.45300248520496e-06, "loss": 0.0029, "num_tokens": 30665719.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.7188710570005534, "frac_reward_zero_std": 1.0, "grad_norm": 0.23828125, "kl": 0.06582375383004546, "learning_rate": 4.447644956127659e-06, "loss": 0.0026, "num_tokens": 30672887.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.7190555248109205, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.09234894486144185, "learning_rate": 4.442289730047332e-06, "loss": 0.0037, "num_tokens": 30683412.0, "reward": 1.665816307067871, "reward_std": 0.4639612138271332, "rewards/fixed_code_pass_all_test_reward/mean": 0.6658163070678711, "rewards/fixed_code_pass_all_test_reward/std": 0.46396124362945557, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7192399926212876, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.053202245151624084, "learning_rate": 4.43693680918521e-06, "loss": 0.0021, "num_tokens": 30690110.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 443.375, "completions/mean_terminated_length": 443.375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7194244604316546, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.028907809522934258, "learning_rate": 4.431586195761575e-06, "loss": 0.0012, "num_tokens": 30700673.0, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.7196089282420217, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.08615269232541323, "learning_rate": 4.426237891995742e-06, "loss": 0.0034, "num_tokens": 30709197.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 597.625, "completions/mean_terminated_length": 597.625, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.7197933960523889, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.03197685023769736, "learning_rate": 4.420891900106072e-06, "loss": 0.0013, "num_tokens": 30723154.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 490.625, "completions/mean_terminated_length": 490.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.719977863862756, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.04681829549372196, "learning_rate": 4.415548222309966e-06, "loss": 0.0019, "num_tokens": 30733367.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 170.375, "completions/mean_terminated_length": 170.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.720162331673123, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884765625, "kl": 0.042073245625942945, "learning_rate": 4.410206860823869e-06, "loss": 0.0017, "num_tokens": 30737762.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 260.5, "completions/mean_terminated_length": 260.5, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7203467994834901, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.04409677488729358, "learning_rate": 4.404867817863261e-06, "loss": 0.0018, "num_tokens": 30745478.0, "reward": 1.6666667461395264, "reward_std": 0.4714045226573944, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.4714045524597168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 315.625, "completions/mean_terminated_length": 315.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.7205312672938572, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.08134877448901534, "learning_rate": 4.399531095642663e-06, "loss": 0.0033, "num_tokens": 30755979.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 381.375, "completions/mean_terminated_length": 381.375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.7207157351042243, "frac_reward_zero_std": 1.0, "grad_norm": 0.060546875, "kl": 0.03833574149757624, "learning_rate": 4.3941966963756296e-06, "loss": 0.0015, "num_tokens": 30766382.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 407.125, "completions/mean_terminated_length": 407.125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7209002029145914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.03767387196421623, "learning_rate": 4.388864622274752e-06, "loss": 0.0015, "num_tokens": 30775255.0, "reward": 1.1666666269302368, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7210846707249585, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.05201926035806537, "learning_rate": 4.3835348755516675e-06, "loss": 0.0021, "num_tokens": 30785128.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.7212691385353256, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.04673083359375596, "learning_rate": 4.378207458417035e-06, "loss": 0.0019, "num_tokens": 30791230.0, "reward": 1.9270832538604736, "reward_std": 0.2062395066022873, "rewards/fixed_code_pass_all_test_reward/mean": 0.9270833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.2062394767999649, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.7214536063456927, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.028265531873330474, "learning_rate": 4.372882373080552e-06, "loss": 0.0011, "num_tokens": 30802307.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.7216380741560597, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.04139554430730641, "learning_rate": 4.3675596217509475e-06, "loss": 0.0017, "num_tokens": 30806024.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.7218225419664268, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.14411377650685608, "learning_rate": 4.362239206635988e-06, "loss": 0.0058, "num_tokens": 30815481.0, "reward": 1.601190447807312, "reward_std": 0.2992134094238281, "rewards/fixed_code_pass_all_test_reward/mean": 0.601190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.2992134094238281, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 551.75, "completions/mean_terminated_length": 551.75, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.722007009776794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0257568359375, "kl": 0.013423286029137671, "learning_rate": 4.356921129942465e-06, "loss": 0.0005, "num_tokens": 30829119.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.7221914775871611, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.0710270048584789, "learning_rate": 4.351605393876203e-06, "loss": 0.0028, "num_tokens": 30837031.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7223759453975281, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.06897640507668257, "learning_rate": 4.346292000642051e-06, "loss": 0.0028, "num_tokens": 30845587.0, "reward": 1.78316330909729, "reward_std": 0.20053407549858093, "rewards/fixed_code_pass_all_test_reward/mean": 0.7831632494926453, "rewards/fixed_code_pass_all_test_reward/std": 0.20053404569625854, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 268.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7225604132078952, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.051326724933460355, "learning_rate": 4.3409809524438884e-06, "loss": 0.0021, "num_tokens": 30852203.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 537.0, "completions/mean_terminated_length": 537.0, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.7227448810182623, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.03230148821603507, "learning_rate": 4.335672251484631e-06, "loss": 0.0013, "num_tokens": 30862331.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.7229293488286294, "frac_reward_zero_std": 1.0, "grad_norm": 0.2060546875, "kl": 0.07391682406887412, "learning_rate": 4.330365899966209e-06, "loss": 0.003, "num_tokens": 30866315.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.7231138166389965, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.06464700261130929, "learning_rate": 4.325061900089582e-06, "loss": 0.0026, "num_tokens": 30875172.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 223.125, "completions/mean_terminated_length": 223.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.7232982844493636, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.07125143008306623, "learning_rate": 4.319760254054732e-06, "loss": 0.0029, "num_tokens": 30881421.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.7234827522597307, "frac_reward_zero_std": 1.0, "grad_norm": 0.059326171875, "kl": 0.0430746590718627, "learning_rate": 4.314460964060672e-06, "loss": 0.0017, "num_tokens": 30887334.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.7236672200700978, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.055299986619502306, "learning_rate": 4.309164032305431e-06, "loss": 0.0022, "num_tokens": 30897884.0, "reward": 1.8888888359069824, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 614.625, "completions/mean_terminated_length": 614.625, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.7238516878804648, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.029017496621236205, "learning_rate": 4.303869460986063e-06, "loss": 0.0012, "num_tokens": 30912785.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 251.625, "completions/mean_terminated_length": 251.625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.7240361556908319, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.09584566159173846, "learning_rate": 4.2985772522986425e-06, "loss": 0.0038, "num_tokens": 30921814.0, "reward": 1.6875, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 232.5, "completions/mean_terminated_length": 232.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.7242206235011991, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.0466416182462126, "learning_rate": 4.293287408438259e-06, "loss": 0.0019, "num_tokens": 30926922.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 301.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.7244050913115662, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.06314856186509132, "learning_rate": 4.287999931599032e-06, "loss": 0.0025, "num_tokens": 30933982.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 352.25, "completions/mean_terminated_length": 352.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.7245895591219332, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.10053978860378265, "learning_rate": 4.282714823974088e-06, "loss": 0.004, "num_tokens": 30943416.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.7247740269323003, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.02633055386831984, "learning_rate": 4.277432087755578e-06, "loss": 0.0011, "num_tokens": 30950835.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 280.375, "completions/mean_terminated_length": 280.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.7249584947426674, "frac_reward_zero_std": 1.0, "grad_norm": 0.427734375, "kl": 0.06682678777724504, "learning_rate": 4.272151725134665e-06, "loss": 0.0027, "num_tokens": 30960558.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7251429625530345, "frac_reward_zero_std": 1.0, "grad_norm": 0.054931640625, "kl": 0.06421800423413515, "learning_rate": 4.266873738301527e-06, "loss": 0.0026, "num_tokens": 30970028.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 201.625, "completions/mean_terminated_length": 201.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.7253274303634016, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.03650564874988049, "learning_rate": 4.261598129445366e-06, "loss": 0.0015, "num_tokens": 30977577.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 192.125, "completions/mean_terminated_length": 192.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.7255118981737687, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.04418681189417839, "learning_rate": 4.256324900754386e-06, "loss": 0.0018, "num_tokens": 30981978.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.7256963659841358, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.08306942647323012, "learning_rate": 4.251054054415809e-06, "loss": 0.0033, "num_tokens": 30988193.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 247.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.7258808337945029, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.06726885074749589, "learning_rate": 4.245785592615865e-06, "loss": 0.0027, "num_tokens": 30992919.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.7260653016048699, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.040368389687500894, "learning_rate": 4.2405195175398055e-06, "loss": 0.0016, "num_tokens": 30997285.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.726249769415237, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.049541141372174025, "learning_rate": 4.235255831371879e-06, "loss": 0.002, "num_tokens": 31004769.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.7264342372256042, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.04935328848659992, "learning_rate": 4.2299945362953534e-06, "loss": 0.002, "num_tokens": 31014313.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 209.875, "completions/mean_terminated_length": 209.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.7266187050359713, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.042537023313343525, "learning_rate": 4.224735634492495e-06, "loss": 0.0017, "num_tokens": 31023392.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.7268031728463383, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.07308513252064586, "learning_rate": 4.219479128144583e-06, "loss": 0.0029, "num_tokens": 31032985.0, "reward": 1.7678570747375488, "reward_std": 0.43003344535827637, "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.430033415555954, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7269876406567054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.06245264131575823, "learning_rate": 4.214225019431908e-06, "loss": 0.0025, "num_tokens": 31040958.0, "reward": 1.2666666507720947, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2666666805744171, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.7271721084670725, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.06570088234730065, "learning_rate": 4.2089733105337585e-06, "loss": 0.0026, "num_tokens": 31048398.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 374.75, "completions/mean_terminated_length": 374.75, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.7273565762774395, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.027797817660029978, "learning_rate": 4.203724003628429e-06, "loss": 0.0011, "num_tokens": 31055100.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.7275410440878067, "frac_reward_zero_std": 1.0, "grad_norm": 0.2255859375, "kl": 0.053654917515814304, "learning_rate": 4.198477100893215e-06, "loss": 0.0021, "num_tokens": 31059866.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 227.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.7277255118981738, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.05054688116069883, "learning_rate": 4.193232604504425e-06, "loss": 0.002, "num_tokens": 31064699.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 280.375, "completions/mean_terminated_length": 280.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.7279099797085409, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.05475267581641674, "learning_rate": 4.187990516637361e-06, "loss": 0.0022, "num_tokens": 31073046.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.728094447518908, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.04386793402954936, "learning_rate": 4.182750839466327e-06, "loss": 0.0018, "num_tokens": 31083647.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 238.875, "completions/mean_terminated_length": 238.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.728278915329275, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.046247567515820265, "learning_rate": 4.177513575164628e-06, "loss": 0.0018, "num_tokens": 31089206.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 299.875, "completions/mean_terminated_length": 299.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7284633831396421, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.028363069519400597, "learning_rate": 4.172278725904565e-06, "loss": 0.0011, "num_tokens": 31094613.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 327.25, "completions/mean_terminated_length": 327.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7286478509500092, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.04581883328501135, "learning_rate": 4.1670462938574466e-06, "loss": 0.0018, "num_tokens": 31104239.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 614.625, "completions/mean_terminated_length": 614.625, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.7288323187603764, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.03610966069390997, "learning_rate": 4.161816281193569e-06, "loss": 0.0014, "num_tokens": 31118444.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.7290167865707434, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.05659267259761691, "learning_rate": 4.1565886900822295e-06, "loss": 0.0023, "num_tokens": 31125163.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 215.5, "completions/mean_terminated_length": 215.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7292012543811105, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.04570330725982785, "learning_rate": 4.151363522691716e-06, "loss": 0.0018, "num_tokens": 31132895.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 339.75, "completions/mean_terminated_length": 339.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7293857221914776, "frac_reward_zero_std": 1.0, "grad_norm": 0.052734375, "kl": 0.03569060063455254, "learning_rate": 4.1461407811893225e-06, "loss": 0.0014, "num_tokens": 31138957.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 326.5, "completions/mean_terminated_length": 326.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.7295701900018446, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.03630253789015114, "learning_rate": 4.140920467741325e-06, "loss": 0.0015, "num_tokens": 31145537.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 513.875, "completions/mean_terminated_length": 513.875, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.7297546578122117, "frac_reward_zero_std": 1.0, "grad_norm": 1.0546875, "kl": 0.04802450002171099, "learning_rate": 4.135702584512998e-06, "loss": 0.0019, "num_tokens": 31155552.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 384.75, "completions/mean_terminated_length": 384.75, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.7299391256225789, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.035032271523959935, "learning_rate": 4.130487133668602e-06, "loss": 0.0014, "num_tokens": 31167190.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 245.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.730123593432946, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.03376211109571159, "learning_rate": 4.125274117371402e-06, "loss": 0.0014, "num_tokens": 31178796.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.730308061243313, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.047599487006664276, "learning_rate": 4.120063537783639e-06, "loss": 0.0019, "num_tokens": 31185407.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 146.0, "completions/mean_terminated_length": 146.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.7304925290536801, "frac_reward_zero_std": 1.0, "grad_norm": 0.984375, "kl": 0.10075640631839633, "learning_rate": 4.1148553970665515e-06, "loss": 0.004, "num_tokens": 31189295.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.7306769968640472, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.0950032314285636, "learning_rate": 4.109649697380362e-06, "loss": 0.0038, "num_tokens": 31193654.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 3961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 261.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.7308614646744143, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.059674195712432265, "learning_rate": 4.104446440884278e-06, "loss": 0.0024, "num_tokens": 31199812.0, "reward": 1.9049999713897705, "reward_std": 0.26870059967041016, "rewards/fixed_code_pass_all_test_reward/mean": 0.9049999713897705, "rewards/fixed_code_pass_all_test_reward/std": 0.26870056986808777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7310459324847814, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.08200742537155747, "learning_rate": 4.0992456297365035e-06, "loss": 0.0033, "num_tokens": 31204442.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 346.375, "completions/mean_terminated_length": 346.375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.7312304002951485, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.05651222821325064, "learning_rate": 4.094047266094225e-06, "loss": 0.0023, "num_tokens": 31211389.0, "reward": 1.5, "reward_std": 0.35456207394599915, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.3545621335506439, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 202.0, "completions/mean_terminated_length": 202.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.7314148681055156, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0674587688408792, "learning_rate": 4.088851352113609e-06, "loss": 0.0027, "num_tokens": 31219301.0, "reward": 1.5125000476837158, "reward_std": 0.6111422777175903, "rewards/fixed_code_pass_all_test_reward/mean": 0.637499988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.2575888931751251, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 242.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.7315993359158827, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.10197881329804659, "learning_rate": 4.0836578899498056e-06, "loss": 0.0041, "num_tokens": 31228196.0, "reward": 1.5372340679168701, "reward_std": 0.4979100227355957, "rewards/fixed_code_pass_all_test_reward/mean": 0.5372340679168701, "rewards/fixed_code_pass_all_test_reward/std": 0.4979100227355957, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.7317838037262497, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.02561659866478294, "learning_rate": 4.0784668817569514e-06, "loss": 0.001, "num_tokens": 31232467.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.7319682715366168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.04426239989697933, "learning_rate": 4.073278329688169e-06, "loss": 0.0018, "num_tokens": 31241595.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 151.75, "completions/mean_terminated_length": 151.75, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.732152739346984, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.06341606727801263, "learning_rate": 4.068092235895554e-06, "loss": 0.0025, "num_tokens": 31249945.0, "reward": 1.8005951642990112, "reward_std": 0.23002919554710388, "rewards/fixed_code_pass_all_test_reward/mean": 0.8005952835083008, "rewards/fixed_code_pass_all_test_reward/std": 0.23002919554710388, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7323372071573511, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.07467827177606523, "learning_rate": 4.062908602530187e-06, "loss": 0.003, "num_tokens": 31257409.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 529.25, "completions/mean_terminated_length": 529.25, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.7325216749677181, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.027758415671996772, "learning_rate": 4.057727431742128e-06, "loss": 0.0011, "num_tokens": 31272035.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 389.125, "completions/mean_terminated_length": 389.125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.7327061427780852, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.032965259393677115, "learning_rate": 4.052548725680408e-06, "loss": 0.0013, "num_tokens": 31284636.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 637.25, "completions/mean_terminated_length": 637.25, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.7328906105884523, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.03681591083295643, "learning_rate": 4.047372486493054e-06, "loss": 0.0015, "num_tokens": 31305070.0, "reward": 1.3870967626571655, "reward_std": 0.26767808198928833, "rewards/fixed_code_pass_all_test_reward/mean": 0.3870967626571655, "rewards/fixed_code_pass_all_test_reward/std": 0.26767808198928833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 187.625, "completions/mean_terminated_length": 187.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7330750783988194, "frac_reward_zero_std": 1.0, "grad_norm": 0.1728515625, "kl": 0.05072584003210068, "learning_rate": 4.042198716327051e-06, "loss": 0.002, "num_tokens": 31309275.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 523.875, "completions/mean_terminated_length": 523.875, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.7332595462091865, "frac_reward_zero_std": 1.0, "grad_norm": 0.058837890625, "kl": 0.032086371327750385, "learning_rate": 4.0370274173283696e-06, "loss": 0.0013, "num_tokens": 31319370.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7334440140195536, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.05128642520867288, "learning_rate": 4.031858591641948e-06, "loss": 0.0021, "num_tokens": 31325852.0, "reward": 1.6576087474822998, "reward_std": 0.3685132563114166, "rewards/fixed_code_pass_all_test_reward/mean": 0.657608687877655, "rewards/fixed_code_pass_all_test_reward/std": 0.3685133159160614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 356.625, "completions/mean_terminated_length": 356.625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.7336284818299207, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.027232181862927973, "learning_rate": 4.0266922414117106e-06, "loss": 0.0011, "num_tokens": 31337937.0, "reward": 1.7777777910232544, "reward_std": 0.3546947240829468, "rewards/fixed_code_pass_all_test_reward/mean": 0.7777777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.3546947240829468, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 261.5, "completions/mean_terminated_length": 261.5, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.7338129496402878, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.07810239819809794, "learning_rate": 4.021528368780545e-06, "loss": 0.0031, "num_tokens": 31342781.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 217.75, "completions/mean_terminated_length": 217.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7339974174506548, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.06326031079515815, "learning_rate": 4.016366975890313e-06, "loss": 0.0025, "num_tokens": 31349123.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 399.25, "completions/mean_terminated_length": 399.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7341818852610219, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.04878048459067941, "learning_rate": 4.011208064881852e-06, "loss": 0.002, "num_tokens": 31360285.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.7343663530713891, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.06772788730449975, "learning_rate": 4.006051637894958e-06, "loss": 0.0027, "num_tokens": 31367828.0, "reward": 1.7272727489471436, "reward_std": 0.16833092272281647, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.16833093762397766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7345508208817562, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.05327897402457893, "learning_rate": 4.000897697068418e-06, "loss": 0.0021, "num_tokens": 31373770.0, "reward": 1.9375, "reward_std": 0.13069948554039001, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.13069947063922882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.7347352886921232, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.05488793249242008, "learning_rate": 3.9957462445399684e-06, "loss": 0.0022, "num_tokens": 31377475.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7349197565024903, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.03953965730033815, "learning_rate": 3.9905972824463225e-06, "loss": 0.0016, "num_tokens": 31386941.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.7351042243128574, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.05587677005678415, "learning_rate": 3.985450812923154e-06, "loss": 0.0022, "num_tokens": 31395224.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.7352886921232245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.043744680006057024, "learning_rate": 3.980306838105114e-06, "loss": 0.0017, "num_tokens": 31400230.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.7354731599335916, "frac_reward_zero_std": 1.0, "grad_norm": 1.0546875, "kl": 0.15104632172733545, "learning_rate": 3.975165360125809e-06, "loss": 0.006, "num_tokens": 31405205.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.7356576277439587, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.052310540806502104, "learning_rate": 3.970026381117813e-06, "loss": 0.0021, "num_tokens": 31411547.0, "reward": 1.7365591526031494, "reward_std": 0.3057028651237488, "rewards/fixed_code_pass_all_test_reward/mean": 0.7365591526031494, "rewards/fixed_code_pass_all_test_reward/std": 0.3057028651237488, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7358420955543258, "frac_reward_zero_std": 1.0, "grad_norm": 0.06103515625, "kl": 0.04552783281542361, "learning_rate": 3.964889903212661e-06, "loss": 0.0018, "num_tokens": 31417131.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.7360265633646929, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.049833309021778405, "learning_rate": 3.959755928540859e-06, "loss": 0.002, "num_tokens": 31421365.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 261.25, "completions/mean_terminated_length": 261.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7362110311750599, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.04094506660476327, "learning_rate": 3.954624459231866e-06, "loss": 0.0016, "num_tokens": 31426487.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 292.0, "completions/mean_terminated_length": 292.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.736395498985427, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.07528483681380749, "learning_rate": 3.949495497414104e-06, "loss": 0.003, "num_tokens": 31431943.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 187.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.7365799667957942, "frac_reward_zero_std": 1.0, "grad_norm": 0.1123046875, "kl": 0.059418728575110435, "learning_rate": 3.944369045214956e-06, "loss": 0.0024, "num_tokens": 31440989.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 278.75, "completions/mean_terminated_length": 278.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.7367644346061613, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.06589688058011234, "learning_rate": 3.9392451047607606e-06, "loss": 0.0026, "num_tokens": 31449459.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.7369489024165283, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.05022402573376894, "learning_rate": 3.934123678176823e-06, "loss": 0.002, "num_tokens": 31460697.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 190.875, "completions/mean_terminated_length": 190.875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.7371333702268954, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.0724384457571432, "learning_rate": 3.9290047675874e-06, "loss": 0.0029, "num_tokens": 31466064.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 466.25, "completions/mean_terminated_length": 466.25, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.7373178380372625, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201171875, "kl": 0.051569382194429636, "learning_rate": 3.923888375115702e-06, "loss": 0.0021, "num_tokens": 31474434.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 467.125, "completions/mean_terminated_length": 467.125, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.7375023058476295, "frac_reward_zero_std": 1.0, "grad_norm": 0.047119140625, "kl": 0.030607226421125233, "learning_rate": 3.9187745028838955e-06, "loss": 0.0012, "num_tokens": 31483691.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 411.375, "completions/mean_terminated_length": 411.375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.7376867736579967, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.031020155176520348, "learning_rate": 3.913663153013112e-06, "loss": 0.0012, "num_tokens": 31497382.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.7378712414683638, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.03623131196945906, "learning_rate": 3.908554327623425e-06, "loss": 0.0014, "num_tokens": 31503819.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 293.5, "completions/mean_terminated_length": 293.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.7380557092787309, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.030016994569450617, "learning_rate": 3.903448028833866e-06, "loss": 0.0012, "num_tokens": 31509615.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.738240177089098, "frac_reward_zero_std": 1.0, "grad_norm": 0.1689453125, "kl": 0.05678371828980744, "learning_rate": 3.898344258762415e-06, "loss": 0.0023, "num_tokens": 31513630.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.738424644899465, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.07028057519346476, "learning_rate": 3.8932430195260055e-06, "loss": 0.0028, "num_tokens": 31517677.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 334.625, "completions/mean_terminated_length": 334.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.7386091127098321, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.05010122852399945, "learning_rate": 3.888144313240522e-06, "loss": 0.002, "num_tokens": 31528842.0, "reward": 1.576923131942749, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5769230723381042, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7387935805201993, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.046350294491276145, "learning_rate": 3.8830481420208026e-06, "loss": 0.0019, "num_tokens": 31534989.0, "reward": 1.975000023841858, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 326.25, "completions/mean_terminated_length": 326.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7389780483305664, "frac_reward_zero_std": 1.0, "grad_norm": 0.69140625, "kl": 0.06769010878633708, "learning_rate": 3.877954507980625e-06, "loss": 0.0027, "num_tokens": 31543999.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7391625161409334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.03588955360464752, "learning_rate": 3.872863413232719e-06, "loss": 0.0014, "num_tokens": 31549505.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 468.75, "completions/mean_terminated_length": 468.75, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.7393469839513005, "frac_reward_zero_std": 1.0, "grad_norm": 0.05126953125, "kl": 0.03707550931721926, "learning_rate": 3.867774859888758e-06, "loss": 0.0015, "num_tokens": 31561351.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.7395314517616676, "frac_reward_zero_std": 0.0, "grad_norm": 3.421875, "kl": 0.047840074403211474, "learning_rate": 3.86268885005937e-06, "loss": 0.0019, "num_tokens": 31566205.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 283.75, "completions/mean_terminated_length": 283.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.7397159195720346, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.06625281227752566, "learning_rate": 3.857605385854117e-06, "loss": 0.0027, "num_tokens": 31572507.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 476.75, "completions/mean_terminated_length": 476.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7399003873824018, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.06483407411724329, "learning_rate": 3.852524469381514e-06, "loss": 0.0026, "num_tokens": 31584089.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.7400848551927689, "frac_reward_zero_std": 0.0, "grad_norm": 4.46875, "kl": 0.03619427964440547, "learning_rate": 3.84744610274901e-06, "loss": 0.0014, "num_tokens": 31591054.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.740269323003136, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "kl": 0.04324462125077844, "learning_rate": 3.842370288063008e-06, "loss": 0.0017, "num_tokens": 31598761.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.740453790813503, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.07415290363132954, "learning_rate": 3.8372970274288435e-06, "loss": 0.003, "num_tokens": 31607402.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.7406382586238701, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.04999978095293045, "learning_rate": 3.832226322950795e-06, "loss": 0.002, "num_tokens": 31616308.0, "reward": 1.6785714626312256, "reward_std": 0.19838999211788177, "rewards/fixed_code_pass_all_test_reward/mean": 0.6785714030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.19839002192020416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 231.5, "completions/mean_terminated_length": 231.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7408227264342372, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.056426588678732514, "learning_rate": 3.827158176732082e-06, "loss": 0.0023, "num_tokens": 31621040.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.7410071942446043, "frac_reward_zero_std": 1.0, "grad_norm": 0.388671875, "kl": 0.05714013963006437, "learning_rate": 3.8220925908748595e-06, "loss": 0.0023, "num_tokens": 31626196.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 354.625, "completions/mean_terminated_length": 354.625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.7411916620549714, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.05246015125885606, "learning_rate": 3.817029567480228e-06, "loss": 0.0021, "num_tokens": 31632737.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 311.75, "completions/mean_terminated_length": 311.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.7413761298653385, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.041725037852302194, "learning_rate": 3.811969108648218e-06, "loss": 0.0017, "num_tokens": 31639951.0, "reward": 1.2307692766189575, "reward_std": 0.1695302128791809, "rewards/fixed_code_pass_all_test_reward/mean": 0.23076923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.1695302128791809, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 150.375, "completions/mean_terminated_length": 150.375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.7415605976757056, "frac_reward_zero_std": 1.0, "grad_norm": 0.1923828125, "kl": 0.06799555360339582, "learning_rate": 3.8069112164777977e-06, "loss": 0.0027, "num_tokens": 31643874.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 294.375, "completions/mean_terminated_length": 294.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7417450654860727, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.06183592323213816, "learning_rate": 3.8018558930668694e-06, "loss": 0.0025, "num_tokens": 31653781.0, "reward": 1.6011905670166016, "reward_std": 0.48021262884140015, "rewards/fixed_code_pass_all_test_reward/mean": 0.726190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.26113057136535645, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 308.0, "completions/mean_terminated_length": 308.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.7419295332964397, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.07268867339007556, "learning_rate": 3.7968031405122785e-06, "loss": 0.0029, "num_tokens": 31663589.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 386.25, "completions/mean_terminated_length": 386.25, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.7421140011068068, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.02726483717560768, "learning_rate": 3.7917529609097937e-06, "loss": 0.0011, "num_tokens": 31672103.0, "reward": 1.954787254333496, "reward_std": 0.044684454798698425, "rewards/fixed_code_pass_all_test_reward/mean": 0.9547872543334961, "rewards/fixed_code_pass_all_test_reward/std": 0.04468446597456932, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 329.875, "completions/mean_terminated_length": 329.875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.742298468917174, "frac_reward_zero_std": 1.0, "grad_norm": 0.2314453125, "kl": 0.05865526804700494, "learning_rate": 3.78670535635412e-06, "loss": 0.0023, "num_tokens": 31679278.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 99.375, "completions/mean_terminated_length": 99.375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.7424829367275411, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.073364133015275, "learning_rate": 3.7816603289388964e-06, "loss": 0.0029, "num_tokens": 31682729.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.7426674045379081, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.10003414191305637, "learning_rate": 3.7766178807566868e-06, "loss": 0.004, "num_tokens": 31687182.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.7428518723482752, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.10265604895539582, "learning_rate": 3.7715780138989965e-06, "loss": 0.0041, "num_tokens": 31696807.0, "reward": 1.905063271522522, "reward_std": 0.17578865587711334, "rewards/fixed_code_pass_all_test_reward/mean": 0.905063271522522, "rewards/fixed_code_pass_all_test_reward/std": 0.17578864097595215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 258.75, "completions/mean_terminated_length": 258.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.7430363401586423, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.0764595225919038, "learning_rate": 3.7665407304562505e-06, "loss": 0.0031, "num_tokens": 31706653.0, "reward": 1.9852941036224365, "reward_std": 0.04159453138709068, "rewards/fixed_code_pass_all_test_reward/mean": 0.9852941036224365, "rewards/fixed_code_pass_all_test_reward/std": 0.04159451276063919, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 209.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.7432208079690094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.043306372594088316, "learning_rate": 3.7615060325178054e-06, "loss": 0.0017, "num_tokens": 31715079.0, "reward": 1.711111068725586, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7111111283302307, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 206.125, "completions/mean_terminated_length": 206.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.7434052757793765, "frac_reward_zero_std": 1.0, "grad_norm": 0.228515625, "kl": 0.06070500996429473, "learning_rate": 3.7564739221719413e-06, "loss": 0.0024, "num_tokens": 31724272.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.7435897435897436, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.04599152400624007, "learning_rate": 3.751444401505876e-06, "loss": 0.0018, "num_tokens": 31731299.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 141.75, "completions/mean_terminated_length": 141.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.7437742114001107, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.06087044859305024, "learning_rate": 3.746417472605742e-06, "loss": 0.0024, "num_tokens": 31735369.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.7439586792104778, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.08288045250810683, "learning_rate": 3.7413931375566026e-06, "loss": 0.0033, "num_tokens": 31739325.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 160.375, "completions/mean_terminated_length": 160.375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7441431470208448, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.05390598042868078, "learning_rate": 3.7363713984424423e-06, "loss": 0.0022, "num_tokens": 31745248.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.7443276148312119, "frac_reward_zero_std": 1.0, "grad_norm": 0.62109375, "kl": 0.10944642825052142, "learning_rate": 3.731352257346165e-06, "loss": 0.0044, "num_tokens": 31749458.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.7445120826415791, "frac_reward_zero_std": 1.0, "grad_norm": 0.3125, "kl": 0.11480052396655083, "learning_rate": 3.7263357163496118e-06, "loss": 0.0046, "num_tokens": 31753606.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 264.625, "completions/mean_terminated_length": 264.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.7446965504519462, "frac_reward_zero_std": 0.0, "grad_norm": 3.546875, "kl": 0.04645316186361015, "learning_rate": 3.7213217775335308e-06, "loss": 0.0019, "num_tokens": 31759771.0, "reward": 1.0499999523162842, "reward_std": 0.45039665699005127, "rewards/fixed_code_pass_all_test_reward/mean": 0.17500001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.16690459847450256, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 278.25, "completions/mean_terminated_length": 278.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.7448810182623132, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.07243277598172426, "learning_rate": 3.7163104429775953e-06, "loss": 0.0029, "num_tokens": 31766261.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 509.125, "completions/mean_terminated_length": 509.125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.7450654860726803, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.06718243390787393, "learning_rate": 3.711301714760396e-06, "loss": 0.0027, "num_tokens": 31782814.0, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 374.5, "completions/mean_terminated_length": 374.5, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.7452499538830474, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.03584563615731895, "learning_rate": 3.70629559495945e-06, "loss": 0.0014, "num_tokens": 31789826.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 376.375, "completions/mean_terminated_length": 376.375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.7454344216934145, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.07629658444784582, "learning_rate": 3.701292085651188e-06, "loss": 0.0031, "num_tokens": 31797317.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.7456188895037816, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.07968980586156249, "learning_rate": 3.696291188910954e-06, "loss": 0.0032, "num_tokens": 31807057.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.7458033573141487, "frac_reward_zero_std": 0.0, "grad_norm": 3.796875, "kl": 0.08251904742792249, "learning_rate": 3.691292906813011e-06, "loss": 0.0033, "num_tokens": 31811285.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 305.25, "completions/mean_terminated_length": 305.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.7459878251245158, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.07431204803287983, "learning_rate": 3.6862972414305407e-06, "loss": 0.003, "num_tokens": 31821167.0, "reward": 1.5625, "reward_std": 0.3204349875450134, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 360.75, "completions/mean_terminated_length": 360.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.7461722929348829, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.0447557563893497, "learning_rate": 3.6813041948356408e-06, "loss": 0.0018, "num_tokens": 31829253.0, "reward": 1.640625, "reward_std": 0.3499840497970581, "rewards/fixed_code_pass_all_test_reward/mean": 0.640625, "rewards/fixed_code_pass_all_test_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.7463567607452499, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.04233520256821066, "learning_rate": 3.6763137690993156e-06, "loss": 0.0017, "num_tokens": 31834495.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.746541228555617, "frac_reward_zero_std": 1.0, "grad_norm": 0.27734375, "kl": 0.0596973083447665, "learning_rate": 3.6713259662914868e-06, "loss": 0.0024, "num_tokens": 31838544.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.7467256963659842, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.0944324133452028, "learning_rate": 3.6663407884809865e-06, "loss": 0.0038, "num_tokens": 31847131.0, "reward": 1.833984375, "reward_std": 0.12414256483316422, "rewards/fixed_code_pass_all_test_reward/mean": 0.833984375, "rewards/fixed_code_pass_all_test_reward/std": 0.12414257228374481, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 470.875, "completions/mean_terminated_length": 470.875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.7469101641763513, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.03823802666738629, "learning_rate": 3.661358237735555e-06, "loss": 0.0015, "num_tokens": 31858298.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 208.5, "completions/mean_terminated_length": 208.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.7470946319867183, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0485664012376219, "learning_rate": 3.6563783161218545e-06, "loss": 0.0019, "num_tokens": 31865622.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 455.125, "completions/mean_terminated_length": 455.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7472790997970854, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.048089121002703905, "learning_rate": 3.6514010257054456e-06, "loss": 0.0019, "num_tokens": 31876703.0, "reward": 1.6517856121063232, "reward_std": 0.4805813729763031, "rewards/fixed_code_pass_all_test_reward/mean": 0.6517857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.4805813729763031, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 175.375, "completions/mean_terminated_length": 175.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.7474635676074525, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.062411981634795666, "learning_rate": 3.6464263685508004e-06, "loss": 0.0025, "num_tokens": 31884434.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.7476480354178195, "frac_reward_zero_std": 1.0, "grad_norm": 0.2421875, "kl": 0.05363671435043216, "learning_rate": 3.641454346721296e-06, "loss": 0.0021, "num_tokens": 31888838.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7478325032281867, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.03755750344134867, "learning_rate": 3.6364849622792262e-06, "loss": 0.0015, "num_tokens": 31895821.0, "reward": 1.5238094329833984, "reward_std": 0.06734349578619003, "rewards/fixed_code_pass_all_test_reward/mean": 0.523809552192688, "rewards/fixed_code_pass_all_test_reward/std": 0.06734351813793182, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 343.625, "completions/mean_terminated_length": 343.625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7480169710385538, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0597763299010694, "learning_rate": 3.631518217285782e-06, "loss": 0.0024, "num_tokens": 31902346.0, "reward": 1.7249999046325684, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2070196568965912, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 261.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.7482014388489209, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.08346698060631752, "learning_rate": 3.6265541138010607e-06, "loss": 0.0033, "num_tokens": 31912697.0, "reward": 1.1574074029922485, "reward_std": 0.02969570830464363, "rewards/fixed_code_pass_all_test_reward/mean": 0.15740740299224854, "rewards/fixed_code_pass_all_test_reward/std": 0.029695691540837288, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 354.0, "completions/mean_terminated_length": 354.0, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.748385906659288, "frac_reward_zero_std": 1.0, "grad_norm": 1.0859375, "kl": 0.09732995182275772, "learning_rate": 3.621592653884065e-06, "loss": 0.0039, "num_tokens": 31921793.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 243.125, "completions/mean_terminated_length": 243.125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.748570374469655, "frac_reward_zero_std": 1.0, "grad_norm": 0.0537109375, "kl": 0.03535222291247919, "learning_rate": 3.6166338395927e-06, "loss": 0.0014, "num_tokens": 31927386.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 363.875, "completions/mean_terminated_length": 363.875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7487548422800221, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.06803541560657322, "learning_rate": 3.61167767298378e-06, "loss": 0.0027, "num_tokens": 31933785.0, "reward": 1.7625000476837158, "reward_std": 0.5289815068244934, "rewards/fixed_code_pass_all_test_reward/mean": 0.887499988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.21001701056957245, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 466.625, "completions/mean_terminated_length": 466.625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.7489393100903893, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.03868067217990756, "learning_rate": 3.6067241561130116e-06, "loss": 0.0015, "num_tokens": 31942846.0, "reward": 1.7125000953674316, "reward_std": 0.09247476607561111, "rewards/fixed_code_pass_all_test_reward/mean": 0.7124999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.0924747884273529, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 354.625, "completions/mean_terminated_length": 354.625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.7491237779007563, "frac_reward_zero_std": 1.0, "grad_norm": 0.040771484375, "kl": 0.027288572513498366, "learning_rate": 3.601773291035009e-06, "loss": 0.0011, "num_tokens": 31950827.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 407.625, "completions/mean_terminated_length": 407.625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.7493082457111234, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.040797405992634594, "learning_rate": 3.596825079803279e-06, "loss": 0.0016, "num_tokens": 31959456.0, "reward": 1.8133116960525513, "reward_std": 0.12911422550678253, "rewards/fixed_code_pass_all_test_reward/mean": 0.8133116960525513, "rewards/fixed_code_pass_all_test_reward/std": 0.12911424040794373, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 193.625, "completions/mean_terminated_length": 193.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.7494927135214905, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.08195904735475779, "learning_rate": 3.5918795244702398e-06, "loss": 0.0033, "num_tokens": 31968885.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 491.0, "completions/mean_terminated_length": 491.0, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.7496771813318576, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.06413039285689592, "learning_rate": 3.5869366270871986e-06, "loss": 0.0026, "num_tokens": 31980725.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 278.75, "completions/mean_terminated_length": 278.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.7498616491422246, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.041907241102308035, "learning_rate": 3.581996389704361e-06, "loss": 0.0017, "num_tokens": 31985971.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 373.625, "completions/mean_terminated_length": 373.625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.7500461169525918, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.030650099739432335, "learning_rate": 3.5770588143708318e-06, "loss": 0.0012, "num_tokens": 31992784.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 263.375, "completions/mean_terminated_length": 263.375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.7502305847629589, "frac_reward_zero_std": 1.0, "grad_norm": 0.037841796875, "kl": 0.02191260363906622, "learning_rate": 3.5721239031346067e-06, "loss": 0.0009, "num_tokens": 31999587.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 221.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.750415052573326, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.03997144033201039, "learning_rate": 3.567191658042588e-06, "loss": 0.0016, "num_tokens": 32009492.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.750599520383693, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.03093021537642926, "learning_rate": 3.5622620811405594e-06, "loss": 0.0012, "num_tokens": 32013608.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7507839881940601, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.06369916070252657, "learning_rate": 3.5573351744732053e-06, "loss": 0.0025, "num_tokens": 32022387.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 334.625, "completions/mean_terminated_length": 334.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7509684560044272, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.04562111175619066, "learning_rate": 3.5524109400840946e-06, "loss": 0.0018, "num_tokens": 32033416.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1142.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 743.0, "completions/mean_terminated_length": 743.0, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.7511529238147944, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.036807326949201524, "learning_rate": 3.5474893800157005e-06, "loss": 0.0015, "num_tokens": 32051200.0, "reward": 1.75, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7513373916251614, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.0813364926725626, "learning_rate": 3.5425704963093787e-06, "loss": 0.0033, "num_tokens": 32060931.0, "reward": 1.298076868057251, "reward_std": 0.28910180926322937, "rewards/fixed_code_pass_all_test_reward/mean": 0.29807692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.28910186886787415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 196.75, "completions/mean_terminated_length": 196.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.7515218594355285, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.09096789918839931, "learning_rate": 3.5376542910053735e-06, "loss": 0.0036, "num_tokens": 32069753.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.7517063272458956, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.05976105481386185, "learning_rate": 3.5327407661428203e-06, "loss": 0.0024, "num_tokens": 32076046.0, "reward": 1.899999976158142, "reward_std": 0.1069045215845108, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.10690449178218842, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.7518907950562627, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.06078268541023135, "learning_rate": 3.5278299237597425e-06, "loss": 0.0024, "num_tokens": 32080118.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 362.625, "completions/mean_terminated_length": 362.625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7520752628666297, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.038403341197408736, "learning_rate": 3.522921765893056e-06, "loss": 0.0015, "num_tokens": 32087595.0, "reward": 1.8517441749572754, "reward_std": 0.01730288751423359, "rewards/fixed_code_pass_all_test_reward/mean": 0.8517441749572754, "rewards/fixed_code_pass_all_test_reward/std": 0.01730288751423359, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7522597306769969, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.052560858661308885, "learning_rate": 3.5180162945785557e-06, "loss": 0.0021, "num_tokens": 32098280.0, "reward": 1.7717392444610596, "reward_std": 0.1826234757900238, "rewards/fixed_code_pass_all_test_reward/mean": 0.77173912525177, "rewards/fixed_code_pass_all_test_reward/std": 0.182623490691185, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 472.5, "completions/mean_terminated_length": 472.5, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.752444198487364, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.03500388516113162, "learning_rate": 3.5131135118509253e-06, "loss": 0.0014, "num_tokens": 32107572.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 479.375, "completions/mean_terminated_length": 479.375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.7526286662977311, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.04439922608435154, "learning_rate": 3.50821341974373e-06, "loss": 0.0018, "num_tokens": 32120607.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 296.25, "completions/mean_terminated_length": 296.25, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7528131341080981, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.07205248414538801, "learning_rate": 3.503316020289429e-06, "loss": 0.0029, "num_tokens": 32127225.0, "reward": 1.6356382369995117, "reward_std": 0.24823956191539764, "rewards/fixed_code_pass_all_test_reward/mean": 0.6356382369995117, "rewards/fixed_code_pass_all_test_reward/std": 0.24823962152004242, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 250.0, "completions/mean_terminated_length": 250.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.7529976019184652, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.040309366304427385, "learning_rate": 3.4984213155193546e-06, "loss": 0.0016, "num_tokens": 32138089.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 322.25, "completions/mean_terminated_length": 322.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.7531820697288323, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.04762192000634968, "learning_rate": 3.49352930746372e-06, "loss": 0.0019, "num_tokens": 32147739.0, "reward": 1.318359375, "reward_std": 0.3207545280456543, "rewards/fixed_code_pass_all_test_reward/mean": 0.318359375, "rewards/fixed_code_pass_all_test_reward/std": 0.3207545578479767, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 363.5, "completions/mean_terminated_length": 363.5, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.7533665375391994, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0306720151565969, "learning_rate": 3.4886399981516337e-06, "loss": 0.0012, "num_tokens": 32154431.0, "reward": 1.7604167461395264, "reward_std": 0.25369295477867126, "rewards/fixed_code_pass_all_test_reward/mean": 0.7604166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.25369298458099365, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 424.375, "completions/mean_terminated_length": 424.375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.7535510053495665, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.05466834071557969, "learning_rate": 3.4837533896110663e-06, "loss": 0.0022, "num_tokens": 32166898.0, "reward": 1.39194917678833, "reward_std": 0.45102745294570923, "rewards/fixed_code_pass_all_test_reward/mean": 0.3919491767883301, "rewards/fixed_code_pass_all_test_reward/std": 0.45102742314338684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 261.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7537354731599336, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.07640819158405066, "learning_rate": 3.478869483868886e-06, "loss": 0.0031, "num_tokens": 32177400.0, "reward": 1.0208333730697632, "reward_std": 0.03857587277889252, "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, "rewards/fixed_code_pass_all_test_reward/std": 0.03857583925127983, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.7539199409703007, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.05219264188781381, "learning_rate": 3.4739882829508277e-06, "loss": 0.0021, "num_tokens": 32187625.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 352.75, "completions/mean_terminated_length": 352.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.7541044087806678, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.04839524545241147, "learning_rate": 3.4691097888815075e-06, "loss": 0.0019, "num_tokens": 32196999.0, "reward": 1.5113636255264282, "reward_std": 0.6106831431388855, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.7542888765910348, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.041621757205575705, "learning_rate": 3.464234003684419e-06, "loss": 0.0017, "num_tokens": 32210046.0, "reward": 1.0833332538604736, "reward_std": 0.03367174416780472, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.033671751618385315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 308.375, "completions/mean_terminated_length": 308.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7544733444014019, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.05318420357070863, "learning_rate": 3.459360929381931e-06, "loss": 0.0021, "num_tokens": 32216977.0, "reward": 1.9330357313156128, "reward_std": 0.18940357863903046, "rewards/fixed_code_pass_all_test_reward/mean": 0.9330357313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.18940360844135284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 426.0, "completions/mean_terminated_length": 426.0, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.7546578122117691, "frac_reward_zero_std": 1.0, "grad_norm": 0.042724609375, "kl": 0.022502449224703014, "learning_rate": 3.4544905679952954e-06, "loss": 0.0009, "num_tokens": 32226209.0, "reward": 1.2083333730697632, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333283662796, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.7548422800221362, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.04604104068130255, "learning_rate": 3.4496229215446286e-06, "loss": 0.0018, "num_tokens": 32231608.0, "reward": 1.990384578704834, "reward_std": 0.027196446433663368, "rewards/fixed_code_pass_all_test_reward/mean": 0.990384578704834, "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 326.75, "completions/mean_terminated_length": 326.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.7550267478325032, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.04880118346773088, "learning_rate": 3.4447579920489273e-06, "loss": 0.002, "num_tokens": 32242510.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 378.875, "completions/mean_terminated_length": 378.875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.7552112156428703, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.025363747190567665, "learning_rate": 3.4398957815260538e-06, "loss": 0.001, "num_tokens": 32251781.0, "reward": 1.625, "reward_std": 0.11785116046667099, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511381149292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 244.0, "completions/mean_terminated_length": 244.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7553956834532374, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.07120131864212453, "learning_rate": 3.435036291992755e-06, "loss": 0.0028, "num_tokens": 32259893.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 263.125, "completions/mean_terminated_length": 263.125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7555801512636044, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.03636761009693146, "learning_rate": 3.43017952546464e-06, "loss": 0.0015, "num_tokens": 32269662.0, "reward": 1.6056337356567383, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6056337952613831, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.7557646190739716, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.05298100505024195, "learning_rate": 3.42532548395619e-06, "loss": 0.0021, "num_tokens": 32281073.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 162.875, "completions/mean_terminated_length": 162.875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.7559490868843387, "frac_reward_zero_std": 1.0, "grad_norm": 0.2255859375, "kl": 0.050558980321511626, "learning_rate": 3.4204741694807563e-06, "loss": 0.002, "num_tokens": 32285240.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 313.0, "completions/mean_terminated_length": 313.0, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.7561335546947058, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.04228200193028897, "learning_rate": 3.415625584050557e-06, "loss": 0.0017, "num_tokens": 32292152.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 356.125, "completions/mean_terminated_length": 356.125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.7563180225050729, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.035631365375593305, "learning_rate": 3.4107797296766867e-06, "loss": 0.0014, "num_tokens": 32299777.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.7565024903154399, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.06862905481830239, "learning_rate": 3.405936608369098e-06, "loss": 0.0027, "num_tokens": 32305108.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 337.5, "completions/mean_terminated_length": 337.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.756686958125807, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.026751959463581443, "learning_rate": 3.401096222136613e-06, "loss": 0.0011, "num_tokens": 32312416.0, "reward": 1.25, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 405.5, "completions/mean_terminated_length": 405.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.7568714259361742, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.07312191382516176, "learning_rate": 3.396258572986916e-06, "loss": 0.0029, "num_tokens": 32324252.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 214.375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.7570558937465413, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.05097799701616168, "learning_rate": 3.391423662926565e-06, "loss": 0.002, "num_tokens": 32333935.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.7572403615569083, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.060855014249682426, "learning_rate": 3.3865914939609744e-06, "loss": 0.0024, "num_tokens": 32343218.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 309.5, "completions/mean_terminated_length": 309.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.7574248293672754, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.0749410460703075, "learning_rate": 3.3817620680944243e-06, "loss": 0.003, "num_tokens": 32352198.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 228.5, "completions/mean_terminated_length": 228.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.7576092971776425, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.05241286661475897, "learning_rate": 3.3769353873300535e-06, "loss": 0.0021, "num_tokens": 32361882.0, "reward": 1.9090909957885742, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 305.0, "completions/mean_terminated_length": 305.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.7577937649880095, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.06220127840060741, "learning_rate": 3.372111453669864e-06, "loss": 0.0025, "num_tokens": 32368418.0, "reward": 1.1586538553237915, "reward_std": 0.30867800116539, "rewards/fixed_code_pass_all_test_reward/mean": 0.2836538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.17915163934230804, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.7579782327983767, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.06263330788351595, "learning_rate": 3.3672902691147256e-06, "loss": 0.0025, "num_tokens": 32376338.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 355.5, "completions/mean_terminated_length": 355.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7581627006087438, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.039467493537813425, "learning_rate": 3.362471835664357e-06, "loss": 0.0016, "num_tokens": 32383558.0, "reward": 1.5384615659713745, "reward_std": 0.5035797357559204, "rewards/fixed_code_pass_all_test_reward/mean": 0.5384615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.5035797953605652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.7583471684191109, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.04247680422849953, "learning_rate": 3.3576561553173413e-06, "loss": 0.0017, "num_tokens": 32390808.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.758531636229478, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.04306232847739011, "learning_rate": 3.3528432300711154e-06, "loss": 0.0017, "num_tokens": 32400645.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.758716104039845, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.08470097626559436, "learning_rate": 3.3480330619219834e-06, "loss": 0.0034, "num_tokens": 32408477.0, "reward": 1.9112902879714966, "reward_std": 0.25090888142585754, "rewards/fixed_code_pass_all_test_reward/mean": 0.9112902879714966, "rewards/fixed_code_pass_all_test_reward/std": 0.25090888142585754, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 247.625, "completions/mean_terminated_length": 247.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7589005718502121, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.08778233313933015, "learning_rate": 3.3432256528650954e-06, "loss": 0.0035, "num_tokens": 32417170.0, "reward": 1.806249976158142, "reward_std": 0.35900408029556274, "rewards/fixed_code_pass_all_test_reward/mean": 0.8062499761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.35900408029556274, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 290.875, "completions/mean_terminated_length": 290.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.7590850396605793, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.05824588635005057, "learning_rate": 3.338421004894462e-06, "loss": 0.0023, "num_tokens": 32426265.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.7592695074709463, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.04188563325442374, "learning_rate": 3.3336191200029422e-06, "loss": 0.0017, "num_tokens": 32431815.0, "reward": 1.955592155456543, "reward_std": 0.1256045252084732, "rewards/fixed_code_pass_all_test_reward/mean": 0.9555920958518982, "rewards/fixed_code_pass_all_test_reward/std": 0.12560449540615082, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 320.5, "completions/mean_terminated_length": 320.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.7594539752813134, "frac_reward_zero_std": 1.0, "grad_norm": 0.03466796875, "kl": 0.01530949556035921, "learning_rate": 3.3288200001822624e-06, "loss": 0.0006, "num_tokens": 32438083.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 429.625, "completions/mean_terminated_length": 429.625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7596384430916805, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.02519311720971018, "learning_rate": 3.3240236474229902e-06, "loss": 0.001, "num_tokens": 32446128.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 242.625, "completions/mean_terminated_length": 242.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7598229109020476, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.16856623953208327, "learning_rate": 3.319230063714548e-06, "loss": 0.0067, "num_tokens": 32454845.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 231.5, "completions/mean_terminated_length": 231.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7600073787124146, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.03624356118962169, "learning_rate": 3.3144392510452127e-06, "loss": 0.0014, "num_tokens": 32462937.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 235.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.7601918465227818, "frac_reward_zero_std": 1.0, "grad_norm": 0.2099609375, "kl": 0.08941646665334702, "learning_rate": 3.309651211402106e-06, "loss": 0.0036, "num_tokens": 32475170.0, "reward": 1.0862069129943848, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.08620689809322357, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 265.625, "completions/mean_terminated_length": 265.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.7603763143331489, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.04856453707907349, "learning_rate": 3.3048659467712098e-06, "loss": 0.0019, "num_tokens": 32481807.0, "reward": 1.4558823108673096, "reward_std": 0.7219422459602356, "rewards/fixed_code_pass_all_test_reward/mean": 0.7058823108673096, "rewards/fixed_code_pass_all_test_reward/std": 0.43504926562309265, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 347.625, "completions/mean_terminated_length": 347.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.760560782143516, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.04833842348307371, "learning_rate": 3.300083459137341e-06, "loss": 0.0019, "num_tokens": 32493268.0, "reward": 1.7272727489471436, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 429.75, "completions/mean_terminated_length": 429.75, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.760745249953883, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.031212302390486002, "learning_rate": 3.29530375048418e-06, "loss": 0.0012, "num_tokens": 32501338.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.7609297177642501, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.06426128186285496, "learning_rate": 3.2905268227942422e-06, "loss": 0.0026, "num_tokens": 32510613.0, "reward": 1.9272727966308594, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9272727370262146, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7611141855746172, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.07807431858964264, "learning_rate": 3.2857526780488925e-06, "loss": 0.0031, "num_tokens": 32521830.0, "reward": 1.072115421295166, "reward_std": 0.20397312939167023, "rewards/fixed_code_pass_all_test_reward/mean": 0.07211538404226303, "rewards/fixed_code_pass_all_test_reward/std": 0.20397312939167023, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 416.75, "completions/mean_terminated_length": 416.75, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.7612986533849844, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.03534783603390679, "learning_rate": 3.280981318228349e-06, "loss": 0.0014, "num_tokens": 32529468.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 322.5, "completions/mean_terminated_length": 322.5, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.7614831211953514, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.04888261971063912, "learning_rate": 3.2762127453116654e-06, "loss": 0.002, "num_tokens": 32536808.0, "reward": 1.1477272510528564, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 408.125, "completions/mean_terminated_length": 408.125, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.7616675890057185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04345703125, "kl": 0.023835362633690238, "learning_rate": 3.271446961276742e-06, "loss": 0.001, "num_tokens": 32545441.0, "reward": 1.1333333253860474, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.13333334028720856, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 278.75, "completions/mean_terminated_length": 278.75, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.7618520568160856, "frac_reward_zero_std": 1.0, "grad_norm": 0.05078125, "kl": 0.04434698726981878, "learning_rate": 3.2666839681003237e-06, "loss": 0.0018, "num_tokens": 32551423.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.7620365246264527, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.09588046837598085, "learning_rate": 3.261923767757994e-06, "loss": 0.0038, "num_tokens": 32563563.0, "reward": 1.392241358757019, "reward_std": 0.20164723694324493, "rewards/fixed_code_pass_all_test_reward/mean": 0.39224135875701904, "rewards/fixed_code_pass_all_test_reward/std": 0.2016472965478897, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 275.875, "completions/mean_terminated_length": 275.875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.7622209924368197, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.03671595093328506, "learning_rate": 3.2571663622241877e-06, "loss": 0.0015, "num_tokens": 32574386.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.7624054602471869, "frac_reward_zero_std": 1.0, "grad_norm": 0.1865234375, "kl": 0.057752672117203474, "learning_rate": 3.2524117534721698e-06, "loss": 0.0023, "num_tokens": 32581676.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.762589928057554, "frac_reward_zero_std": 0.0, "grad_norm": 3.390625, "kl": 0.0601275390945375, "learning_rate": 3.2476599434740508e-06, "loss": 0.0024, "num_tokens": 32588631.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.7627743958679211, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.038150208769366145, "learning_rate": 3.2429109342007747e-06, "loss": 0.0015, "num_tokens": 32594015.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 591.125, "completions/mean_terminated_length": 591.125, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.7629588636782881, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.03661694540642202, "learning_rate": 3.238164727622135e-06, "loss": 0.0015, "num_tokens": 32611904.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 555.0, "completions/mean_terminated_length": 555.0, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.7631433314886552, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.04291894240304828, "learning_rate": 3.2334213257067524e-06, "loss": 0.0017, "num_tokens": 32626816.0, "reward": 0.987500011920929, "reward_std": 0.4432697892189026, "rewards/fixed_code_pass_all_test_reward/mean": 0.11250000447034836, "rewards/fixed_code_pass_all_test_reward/std": 0.19835634529590607, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 332.375, "completions/mean_terminated_length": 332.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7633277992990223, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.05417178221978247, "learning_rate": 3.228680730422088e-06, "loss": 0.0022, "num_tokens": 32639635.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7635122671093895, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.07372351619414985, "learning_rate": 3.223942943734433e-06, "loss": 0.0029, "num_tokens": 32647362.0, "reward": 1.1493055820465088, "reward_std": 0.3504021167755127, "rewards/fixed_code_pass_all_test_reward/mean": 0.1493055522441864, "rewards/fixed_code_pass_all_test_reward/std": 0.3504021167755127, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 350.0, "completions/mean_terminated_length": 350.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.7636967349197565, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.05170205049216747, "learning_rate": 3.2192079676089284e-06, "loss": 0.0021, "num_tokens": 32656770.0, "reward": 1.6477272510528564, "reward_std": 0.26364758610725403, "rewards/fixed_code_pass_all_test_reward/mean": 0.6477272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.26364758610725403, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 155.25, "completions/mean_terminated_length": 155.25, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7638812027301236, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.049579116981476545, "learning_rate": 3.214475804009535e-06, "loss": 0.002, "num_tokens": 32663308.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 186.875, "completions/mean_terminated_length": 186.875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.7640656705404907, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.07565089454874396, "learning_rate": 3.209746454899051e-06, "loss": 0.003, "num_tokens": 32667555.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 242.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7642501383508578, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.04640974220819771, "learning_rate": 3.205019922239111e-06, "loss": 0.0019, "num_tokens": 32673642.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 208.5, "completions/mean_terminated_length": 208.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.7644346061612248, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.06482901517301798, "learning_rate": 3.2002962079901743e-06, "loss": 0.0026, "num_tokens": 32683334.0, "reward": 1.5357142686843872, "reward_std": 0.06612997502088547, "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.06613001972436905, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.764619073971592, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.07071806467138231, "learning_rate": 3.195575314111542e-06, "loss": 0.0028, "num_tokens": 32692248.0, "reward": 1.5635592937469482, "reward_std": 0.22771234810352325, "rewards/fixed_code_pass_all_test_reward/mean": 0.563559353351593, "rewards/fixed_code_pass_all_test_reward/std": 0.22771237790584564, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 239.25, "completions/mean_terminated_length": 239.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.7648035417819591, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.052610260783694685, "learning_rate": 3.190857242561336e-06, "loss": 0.0021, "num_tokens": 32701906.0, "reward": 1.3465189933776855, "reward_std": 0.26404622197151184, "rewards/fixed_code_pass_all_test_reward/mean": 0.34651899337768555, "rewards/fixed_code_pass_all_test_reward/std": 0.26404622197151184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 398.5, "completions/mean_terminated_length": 398.5, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.7649880095923262, "frac_reward_zero_std": 1.0, "grad_norm": 0.048583984375, "kl": 0.033253689762204885, "learning_rate": 3.186141995296509e-06, "loss": 0.0013, "num_tokens": 32709854.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7651724774026932, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.05592645052820444, "learning_rate": 3.1814295742728417e-06, "loss": 0.0022, "num_tokens": 32714160.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 247.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.7653569452130603, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.046793330227956176, "learning_rate": 3.1767199814449523e-06, "loss": 0.0019, "num_tokens": 32719966.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 608.125, "completions/mean_terminated_length": 608.125, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.7655414130234274, "frac_reward_zero_std": 1.0, "grad_norm": 0.040771484375, "kl": 0.04397892393171787, "learning_rate": 3.172013218766273e-06, "loss": 0.0018, "num_tokens": 32731463.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 308.5, "completions/mean_terminated_length": 308.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.7657258808337944, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.05189069453626871, "learning_rate": 3.167309288189068e-06, "loss": 0.0021, "num_tokens": 32739451.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 324.625, "completions/mean_terminated_length": 324.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.7659103486441616, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.03736904147081077, "learning_rate": 3.162608191664426e-06, "loss": 0.0015, "num_tokens": 32750072.0, "reward": 1.3125, "reward_std": 0.5938674807548523, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.7660948164545287, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.07086102711036801, "learning_rate": 3.1579099311422566e-06, "loss": 0.0028, "num_tokens": 32757443.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 373.5, "completions/mean_terminated_length": 373.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7662792842648958, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.039972705068066716, "learning_rate": 3.153214508571305e-06, "loss": 0.0016, "num_tokens": 32765063.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7664637520752628, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.056149824522435665, "learning_rate": 3.148521925899126e-06, "loss": 0.0022, "num_tokens": 32773076.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 251.625, "completions/mean_terminated_length": 251.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.7666482198856299, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.0912590348161757, "learning_rate": 3.1438321850721033e-06, "loss": 0.0037, "num_tokens": 32781689.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 201.625, "completions/mean_terminated_length": 201.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.766832687695997, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.155959015712142, "learning_rate": 3.1391452880354357e-06, "loss": 0.0062, "num_tokens": 32791406.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 317.125, "completions/mean_terminated_length": 317.125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.7670171555063642, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.05146349244751036, "learning_rate": 3.134461236733153e-06, "loss": 0.0021, "num_tokens": 32802215.0, "reward": 1.9006409645080566, "reward_std": 0.2810296416282654, "rewards/fixed_code_pass_all_test_reward/mean": 0.9006410241127014, "rewards/fixed_code_pass_all_test_reward/std": 0.2810296416282654, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 226.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7672016233167313, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.056921913754194975, "learning_rate": 3.129780033108096e-06, "loss": 0.0023, "num_tokens": 32811940.0, "reward": 1.1586538553237915, "reward_std": 0.047939348965883255, "rewards/fixed_code_pass_all_test_reward/mean": 0.1586538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.04793936759233475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 512.25, "completions/mean_terminated_length": 512.25, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.7673860911270983, "frac_reward_zero_std": 1.0, "grad_norm": 0.0478515625, "kl": 0.025409430032595992, "learning_rate": 3.1251016791019262e-06, "loss": 0.001, "num_tokens": 32820358.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 191.375, "completions/mean_terminated_length": 191.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.7675705589374654, "frac_reward_zero_std": 1.0, "grad_norm": 0.166015625, "kl": 0.05526601197198033, "learning_rate": 3.1204261766551257e-06, "loss": 0.0022, "num_tokens": 32824873.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 304.875, "completions/mean_terminated_length": 304.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.7677550267478325, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.05338413082063198, "learning_rate": 3.115753527706986e-06, "loss": 0.0021, "num_tokens": 32833208.0, "reward": 1.2691326141357422, "reward_std": 0.10376635938882828, "rewards/fixed_code_pass_all_test_reward/mean": 0.26913267374038696, "rewards/fixed_code_pass_all_test_reward/std": 0.10376638174057007, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.7679394945581995, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.048667240887880325, "learning_rate": 3.1110837341956267e-06, "loss": 0.0019, "num_tokens": 32843649.0, "reward": 1.94021737575531, "reward_std": 0.16909077763557434, "rewards/fixed_code_pass_all_test_reward/mean": 0.9402173757553101, "rewards/fixed_code_pass_all_test_reward/std": 0.16909076273441315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 106.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.7681239623685667, "frac_reward_zero_std": 1.0, "grad_norm": 0.421875, "kl": 0.03958469335339032, "learning_rate": 3.1064167980579783e-06, "loss": 0.0016, "num_tokens": 32847183.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.7683084301789338, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.04493325995281339, "learning_rate": 3.101752721229784e-06, "loss": 0.0018, "num_tokens": 32852728.0, "reward": 1.6465517282485962, "reward_std": 0.22328369319438934, "rewards/fixed_code_pass_all_test_reward/mean": 0.6465517282485962, "rewards/fixed_code_pass_all_test_reward/std": 0.22328370809555054, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.7684928979893009, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.054873981745913625, "learning_rate": 3.097091505645601e-06, "loss": 0.0022, "num_tokens": 32857419.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 343.125, "completions/mean_terminated_length": 343.125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7686773657996679, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.02541735756676644, "learning_rate": 3.0924331532387973e-06, "loss": 0.001, "num_tokens": 32864252.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 443.125, "completions/mean_terminated_length": 443.125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.768861833610035, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.04061677400022745, "learning_rate": 3.087777665941565e-06, "loss": 0.0016, "num_tokens": 32874493.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 410.875, "completions/mean_terminated_length": 410.875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.7690463014204021, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.03205172345042229, "learning_rate": 3.0831250456848938e-06, "loss": 0.0013, "num_tokens": 32883124.0, "reward": 1.380357027053833, "reward_std": 0.5131471753120422, "rewards/fixed_code_pass_all_test_reward/mean": 0.38035711646080017, "rewards/fixed_code_pass_all_test_reward/std": 0.5131471157073975, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 209.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.7692307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.09243243839591742, "learning_rate": 3.0784752943985927e-06, "loss": 0.0037, "num_tokens": 32887750.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 294.875, "completions/mean_terminated_length": 294.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7694152370411363, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.05263601988554001, "learning_rate": 3.073828414011274e-06, "loss": 0.0021, "num_tokens": 32897789.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.7695997048515034, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.059216899797320366, "learning_rate": 3.0691844064503697e-06, "loss": 0.0024, "num_tokens": 32902708.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 222.75, "completions/mean_terminated_length": 222.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7697841726618705, "frac_reward_zero_std": 1.0, "grad_norm": 0.32421875, "kl": 0.07763046631589532, "learning_rate": 3.06454327364211e-06, "loss": 0.0031, "num_tokens": 32910234.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.7699686404722376, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.049709743820130825, "learning_rate": 3.0599050175115363e-06, "loss": 0.002, "num_tokens": 32920086.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 340.5, "completions/mean_terminated_length": 340.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7701531082826046, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.06662346515804529, "learning_rate": 3.0552696399825e-06, "loss": 0.0027, "num_tokens": 32929458.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.7703375760929718, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.08871415350586176, "learning_rate": 3.0506371429776483e-06, "loss": 0.0035, "num_tokens": 32939266.0, "reward": 1.3235294818878174, "reward_std": 0.6852734088897705, "rewards/fixed_code_pass_all_test_reward/mean": 0.4485294222831726, "rewards/fixed_code_pass_all_test_reward/std": 0.46524009108543396, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 282.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.7705220439033389, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.07742608245462179, "learning_rate": 3.046007528418451e-06, "loss": 0.0031, "num_tokens": 32948302.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 254.625, "completions/mean_terminated_length": 254.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.770706511713706, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.043623125180602074, "learning_rate": 3.0413807982251666e-06, "loss": 0.0017, "num_tokens": 32953051.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 320.5, "completions/mean_terminated_length": 320.5, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.770890979524073, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.025232777930796146, "learning_rate": 3.036756954316864e-06, "loss": 0.001, "num_tokens": 32961623.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 436.125, "completions/mean_terminated_length": 436.125, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.7710754473344401, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.03606778779067099, "learning_rate": 3.0321359986114096e-06, "loss": 0.0014, "num_tokens": 32970528.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 261.5, "completions/mean_terminated_length": 261.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.7712599151448072, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.06343024934176356, "learning_rate": 3.0275179330254846e-06, "loss": 0.0025, "num_tokens": 32977780.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 384.25, "completions/mean_terminated_length": 384.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.7714443829551744, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.08843437302857637, "learning_rate": 3.0229027594745564e-06, "loss": 0.0035, "num_tokens": 32985062.0, "reward": 1.8035714626312256, "reward_std": 0.388894647359848, "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.07636035233736038, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.7716288507655414, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.030130058177746832, "learning_rate": 3.018290479872903e-06, "loss": 0.0012, "num_tokens": 32996590.0, "reward": 1.721982717514038, "reward_std": 0.43680253624916077, "rewards/fixed_code_pass_all_test_reward/mean": 0.7219827771186829, "rewards/fixed_code_pass_all_test_reward/std": 0.43680256605148315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 165.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.7718133185759085, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.049798961728811264, "learning_rate": 3.013681096133595e-06, "loss": 0.002, "num_tokens": 33001896.0, "reward": 1.3177082538604736, "reward_std": 0.2851634621620178, "rewards/fixed_code_pass_all_test_reward/mean": 0.3177083134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.2851634621620178, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 430.5, "completions/mean_terminated_length": 430.5, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.7719977863862756, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.03016367694362998, "learning_rate": 3.0090746101685042e-06, "loss": 0.0012, "num_tokens": 33014140.0, "reward": 1.5494792461395264, "reward_std": 0.2098715901374817, "rewards/fixed_code_pass_all_test_reward/mean": 0.5494791865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2098715603351593, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 360.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.7721822541966427, "frac_reward_zero_std": 1.0, "grad_norm": 0.3984375, "kl": 0.068505450617522, "learning_rate": 3.004471023888308e-06, "loss": 0.0027, "num_tokens": 33024126.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.7723667220070097, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.041128334822133183, "learning_rate": 2.9998703392024696e-06, "loss": 0.0016, "num_tokens": 33029540.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.7725511898173769, "frac_reward_zero_std": 1.0, "grad_norm": 0.55078125, "kl": 0.09172767726704478, "learning_rate": 2.9952725580192545e-06, "loss": 0.0037, "num_tokens": 33038373.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 369.125, "completions/mean_terminated_length": 369.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.772735657627744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.050914375111460686, "learning_rate": 2.9906776822457206e-06, "loss": 0.002, "num_tokens": 33049606.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.7729201254381111, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.08352124504745007, "learning_rate": 2.98608571378773e-06, "loss": 0.0033, "num_tokens": 33056838.0, "reward": 1.7272727489471436, "reward_std": 0.16833092272281647, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.16833093762397766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 264.75, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7731045932484781, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.053373128175735474, "learning_rate": 2.9814966545499257e-06, "loss": 0.0021, "num_tokens": 33063060.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 187.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.7732890610588452, "frac_reward_zero_std": 1.0, "grad_norm": 0.265625, "kl": 0.07579778088256717, "learning_rate": 2.976910506435754e-06, "loss": 0.003, "num_tokens": 33069812.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 486.25, "completions/mean_terminated_length": 486.25, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.7734735288692123, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.028631198685616255, "learning_rate": 2.972327271347448e-06, "loss": 0.0011, "num_tokens": 33082302.0, "reward": 1.5868055820465088, "reward_std": 0.22588133811950684, "rewards/fixed_code_pass_all_test_reward/mean": 0.5868055820465088, "rewards/fixed_code_pass_all_test_reward/std": 0.22588133811950684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 306.5, "completions/mean_terminated_length": 306.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7736579966795795, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.08424355299212039, "learning_rate": 2.9677469511860336e-06, "loss": 0.0034, "num_tokens": 33089242.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.7738424644899465, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.0516217986587435, "learning_rate": 2.963169547851332e-06, "loss": 0.0021, "num_tokens": 33094510.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.7740269323003136, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.04531557555310428, "learning_rate": 2.9585950632419502e-06, "loss": 0.0018, "num_tokens": 33098800.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7742114001106807, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.05260238004848361, "learning_rate": 2.9540234992552854e-06, "loss": 0.0021, "num_tokens": 33107899.0, "reward": 1.8783783912658691, "reward_std": 0.2324952632188797, "rewards/fixed_code_pass_all_test_reward/mean": 0.8783783912658691, "rewards/fixed_code_pass_all_test_reward/std": 0.2324952930212021, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 522.25, "completions/mean_terminated_length": 522.25, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.7743958679210478, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.04122747655492276, "learning_rate": 2.9494548577875195e-06, "loss": 0.0016, "num_tokens": 33120149.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 247.125, "completions/mean_terminated_length": 247.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7745803357314148, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.04426332353614271, "learning_rate": 2.9448891407336334e-06, "loss": 0.0018, "num_tokens": 33126214.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 224.75, "completions/mean_terminated_length": 224.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.774764803541782, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.054834515787661076, "learning_rate": 2.9403263499873857e-06, "loss": 0.0022, "num_tokens": 33130908.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 174.875, "completions/mean_terminated_length": 174.875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.7749492713521491, "frac_reward_zero_std": 1.0, "grad_norm": 0.734375, "kl": 0.09434939781203866, "learning_rate": 2.9357664874413215e-06, "loss": 0.0038, "num_tokens": 33139819.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 414.375, "completions/mean_terminated_length": 414.375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.7751337391625162, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.05673120147548616, "learning_rate": 2.931209554986775e-06, "loss": 0.0023, "num_tokens": 33148494.0, "reward": 0.762499988079071, "reward_std": 0.46623218059539795, "rewards/fixed_code_pass_all_test_reward/mean": 0.012500000186264515, "rewards/fixed_code_pass_all_test_reward/std": 0.00505076302215457, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.7753182069728832, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.0360052939504385, "learning_rate": 2.9266555545138553e-06, "loss": 0.0014, "num_tokens": 33154773.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7755026747832503, "frac_reward_zero_std": 0.0, "grad_norm": 3.40625, "kl": 0.11513505736365914, "learning_rate": 2.922104487911478e-06, "loss": 0.0046, "num_tokens": 33158594.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7756871425936174, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.07071851892396808, "learning_rate": 2.9175563570673194e-06, "loss": 0.0028, "num_tokens": 33165335.0, "reward": 1.7750000953674316, "reward_std": 0.310529500246048, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.31052953004837036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 638.375, "completions/mean_terminated_length": 638.375, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.7758716104039846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0458984375, "kl": 0.026999272289685905, "learning_rate": 2.913011163867847e-06, "loss": 0.0011, "num_tokens": 33180306.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 476.125, "completions/mean_terminated_length": 476.125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.7760560782143516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.03292295103892684, "learning_rate": 2.9084689101983076e-06, "loss": 0.0013, "num_tokens": 33195859.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 325.25, "completions/mean_terminated_length": 325.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.7762405460247187, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.04558067652396858, "learning_rate": 2.9039295979427294e-06, "loss": 0.0018, "num_tokens": 33205829.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 307.125, "completions/mean_terminated_length": 307.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.7764250138350858, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.05116949323564768, "learning_rate": 2.899393228983924e-06, "loss": 0.002, "num_tokens": 33214014.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.7766094816454528, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.0754763507284224, "learning_rate": 2.8948598052034783e-06, "loss": 0.003, "num_tokens": 33221274.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 329.375, "completions/mean_terminated_length": 329.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.7767939494558199, "frac_reward_zero_std": 1.0, "grad_norm": 0.36328125, "kl": 0.06660783616825938, "learning_rate": 2.890329328481758e-06, "loss": 0.0027, "num_tokens": 33228037.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.7769784172661871, "frac_reward_zero_std": 1.0, "grad_norm": 0.248046875, "kl": 0.08671083953231573, "learning_rate": 2.885801800697905e-06, "loss": 0.0035, "num_tokens": 33233193.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.7771628850765542, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.04719036864116788, "learning_rate": 2.8812772237298437e-06, "loss": 0.0019, "num_tokens": 33238478.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 199.75, "completions/mean_terminated_length": 199.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.7773473528869213, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.07861263770610094, "learning_rate": 2.8767555994542694e-06, "loss": 0.0031, "num_tokens": 33244292.0, "reward": 1.925480842590332, "reward_std": 0.21077220141887665, "rewards/fixed_code_pass_all_test_reward/mean": 0.9254807829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.21077221632003784, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 346.25, "completions/mean_terminated_length": 346.25, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.7775318206972883, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.04728544922545552, "learning_rate": 2.8722369297466555e-06, "loss": 0.0019, "num_tokens": 33253038.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 296.25, "completions/mean_terminated_length": 296.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.7777162885076554, "frac_reward_zero_std": 1.0, "grad_norm": 0.5859375, "kl": 0.051562837325036526, "learning_rate": 2.8677212164812464e-06, "loss": 0.0021, "num_tokens": 33260544.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.7779007563180225, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.0772990039549768, "learning_rate": 2.8632084615310596e-06, "loss": 0.0031, "num_tokens": 33265524.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 323.125, "completions/mean_terminated_length": 323.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7780852241283895, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.07035385351628065, "learning_rate": 2.8586986667678962e-06, "loss": 0.0028, "num_tokens": 33271285.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 4218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7782696919387567, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.045751762576401234, "learning_rate": 2.8541918340623174e-06, "loss": 0.0018, "num_tokens": 33278073.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 309.875, "completions/mean_terminated_length": 309.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.7784541597491238, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.026994684943929315, "learning_rate": 2.8496879652836593e-06, "loss": 0.0011, "num_tokens": 33285336.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 296.0, "completions/mean_terminated_length": 296.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7786386275594909, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.06472170678898692, "learning_rate": 2.845187062300029e-06, "loss": 0.0026, "num_tokens": 33297864.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 346.0, "completions/mean_terminated_length": 346.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.7788230953698579, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.047903030877932906, "learning_rate": 2.8406891269783077e-06, "loss": 0.0019, "num_tokens": 33307368.0, "reward": 1.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 497.25, "completions/mean_terminated_length": 497.25, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.779007563180225, "frac_reward_zero_std": 0.0, "grad_norm": 0.765625, "kl": 0.03423214145004749, "learning_rate": 2.8361941611841394e-06, "loss": 0.0014, "num_tokens": 33320522.0, "reward": 1.34375, "reward_std": 0.4212544858455658, "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, "rewards/fixed_code_pass_all_test_reward/std": 0.4212545156478882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.7791920309905921, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.08260319475084543, "learning_rate": 2.8317021667819376e-06, "loss": 0.0033, "num_tokens": 33327525.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 372.375, "completions/mean_terminated_length": 372.375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.7793764988009593, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.07851535826921463, "learning_rate": 2.827213145634887e-06, "loss": 0.0031, "num_tokens": 33337816.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 464.375, "completions/mean_terminated_length": 464.375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.7795609666113263, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.041049459541682154, "learning_rate": 2.8227270996049325e-06, "loss": 0.0016, "num_tokens": 33346739.0, "reward": 1.045454502105713, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.04545454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 334.375, "completions/mean_terminated_length": 334.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7797454344216934, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.05941079882904887, "learning_rate": 2.818244030552796e-06, "loss": 0.0024, "num_tokens": 33353926.0, "reward": 1.6071429252624512, "reward_std": 0.6645983457565308, "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.3278830349445343, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 133.0, "completions/mean_terminated_length": 133.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.7799299022320605, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.10189767135307193, "learning_rate": 2.8137639403379523e-06, "loss": 0.0041, "num_tokens": 33357862.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 245.0, "completions/mean_terminated_length": 245.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.7801143700424276, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.033874317072331905, "learning_rate": 2.8092868308186483e-06, "loss": 0.0014, "num_tokens": 33363142.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 331.375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7802988378527946, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.04291633632965386, "learning_rate": 2.804812703851888e-06, "loss": 0.0017, "num_tokens": 33370497.0, "reward": 1.9186046123504639, "reward_std": 0.02153071016073227, "rewards/fixed_code_pass_all_test_reward/mean": 0.9186046123504639, "rewards/fixed_code_pass_all_test_reward/std": 0.02153068408370018, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 335.0, "completions/mean_terminated_length": 335.0, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.7804833056631618, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.04261951241642237, "learning_rate": 2.8003415612934503e-06, "loss": 0.0017, "num_tokens": 33378105.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7806677734735289, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.06398509512655437, "learning_rate": 2.7958734049978644e-06, "loss": 0.0026, "num_tokens": 33389156.0, "reward": 1.9943820238113403, "reward_std": 0.015890035778284073, "rewards/fixed_code_pass_all_test_reward/mean": 0.9943820238113403, "rewards/fixed_code_pass_all_test_reward/std": 0.015890037640929222, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.780852241283896, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.08685298543423414, "learning_rate": 2.7914082368184247e-06, "loss": 0.0035, "num_tokens": 33398403.0, "reward": 1.1428570747375488, "reward_std": 0.6388765573501587, "rewards/fixed_code_pass_all_test_reward/mean": 0.2678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.45456865429878235, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.781036709094263, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.0770256775431335, "learning_rate": 2.786946058607187e-06, "loss": 0.0031, "num_tokens": 33402502.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 375.875, "completions/mean_terminated_length": 375.875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7812211769046301, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.08308914816007018, "learning_rate": 2.7824868722149645e-06, "loss": 0.0033, "num_tokens": 33412789.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 178.375, "completions/mean_terminated_length": 178.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7814056447149972, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.06521746376529336, "learning_rate": 2.7780306794913358e-06, "loss": 0.0026, "num_tokens": 33425928.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7815901125253644, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.05076069990172982, "learning_rate": 2.7735774822846317e-06, "loss": 0.002, "num_tokens": 33437962.0, "reward": 1.4821429252624512, "reward_std": 0.07393556088209152, "rewards/fixed_code_pass_all_test_reward/mean": 0.4821428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.0739356055855751, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 419.0, "completions/mean_terminated_length": 419.0, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.7817745803357314, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.045056732604280114, "learning_rate": 2.7691272824419402e-06, "loss": 0.0018, "num_tokens": 33450026.0, "reward": 1.71875, "reward_std": 0.585349440574646, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.2893187701702118, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.7819590481460985, "frac_reward_zero_std": 1.0, "grad_norm": 0.189453125, "kl": 0.0858864695765078, "learning_rate": 2.764680081809108e-06, "loss": 0.0034, "num_tokens": 33454456.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7821435159564656, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.13541612681001425, "learning_rate": 2.7602358822307417e-06, "loss": 0.0054, "num_tokens": 33463520.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 191.625, "completions/mean_terminated_length": 191.625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.7823279837668327, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.052864542696624994, "learning_rate": 2.755794685550197e-06, "loss": 0.0021, "num_tokens": 33467909.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.7825124515771997, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.06911385664716363, "learning_rate": 2.7513564936095872e-06, "loss": 0.0028, "num_tokens": 33476766.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 410.5, "completions/mean_terminated_length": 410.5, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.7826969193875669, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.01834303737268783, "learning_rate": 2.7469213082497736e-06, "loss": 0.0007, "num_tokens": 33484122.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 294.875, "completions/mean_terminated_length": 294.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.782881387197934, "frac_reward_zero_std": 1.0, "grad_norm": 0.423828125, "kl": 0.0560229200636968, "learning_rate": 2.742489131310381e-06, "loss": 0.0022, "num_tokens": 33492585.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 289.0, "completions/mean_terminated_length": 289.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.7830658550083011, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.04068236262537539, "learning_rate": 2.73805996462978e-06, "loss": 0.0016, "num_tokens": 33498401.0, "reward": 1.75, "reward_std": 0.22160130739212036, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.22160132229328156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 432.5, "completions/mean_terminated_length": 432.5, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7832503228186681, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.06580510572530329, "learning_rate": 2.7336338100450943e-06, "loss": 0.0026, "num_tokens": 33510349.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 376.375, "completions/mean_terminated_length": 376.375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.7834347906290352, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.05010412633419037, "learning_rate": 2.7292106693921937e-06, "loss": 0.002, "num_tokens": 33521656.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 357.625, "completions/mean_terminated_length": 357.625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7836192584394023, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.05883651529438794, "learning_rate": 2.7247905445057042e-06, "loss": 0.0024, "num_tokens": 33533461.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 236.375, "completions/mean_terminated_length": 236.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.7838037262497695, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.06159126712009311, "learning_rate": 2.720373437218994e-06, "loss": 0.0025, "num_tokens": 33538240.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 228.5, "completions/mean_terminated_length": 228.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.7839881940601365, "frac_reward_zero_std": 1.0, "grad_norm": 0.86328125, "kl": 0.08891172823496163, "learning_rate": 2.71595934936419e-06, "loss": 0.0036, "num_tokens": 33543876.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 349.625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7841726618705036, "frac_reward_zero_std": 1.0, "grad_norm": 0.057861328125, "kl": 0.03627986507490277, "learning_rate": 2.7115482827721563e-06, "loss": 0.0015, "num_tokens": 33555937.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7843571296808707, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0324509225320071, "learning_rate": 2.70714023927251e-06, "loss": 0.0013, "num_tokens": 33561265.0, "reward": 1.916273593902588, "reward_std": 0.23681403696537018, "rewards/fixed_code_pass_all_test_reward/mean": 0.9162735939025879, "rewards/fixed_code_pass_all_test_reward/std": 0.23681406676769257, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 260.5, "completions/mean_terminated_length": 260.5, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.7845415974912378, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.06144798151217401, "learning_rate": 2.7027352206936084e-06, "loss": 0.0025, "num_tokens": 33569589.0, "reward": 1.462499976158142, "reward_std": 0.16078920662403107, "rewards/fixed_code_pass_all_test_reward/mean": 0.4625000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.16078922152519226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 430.5, "completions/mean_terminated_length": 430.5, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.7847260653016048, "frac_reward_zero_std": 1.0, "grad_norm": 0.057373046875, "kl": 0.03900893859099597, "learning_rate": 2.698333228862564e-06, "loss": 0.0016, "num_tokens": 33578145.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.784910533111972, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.04833257361315191, "learning_rate": 2.6939342656052248e-06, "loss": 0.0019, "num_tokens": 33582557.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 239.75, "completions/mean_terminated_length": 239.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.7850950009223391, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.09692082740366459, "learning_rate": 2.6895383327461864e-06, "loss": 0.0039, "num_tokens": 33589995.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 320.75, "completions/mean_terminated_length": 320.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7852794687327062, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.0596744401846081, "learning_rate": 2.6851454321087865e-06, "loss": 0.0024, "num_tokens": 33599609.0, "reward": 1.829545497894287, "reward_std": 0.09009382128715515, "rewards/fixed_code_pass_all_test_reward/mean": 0.8295454978942871, "rewards/fixed_code_pass_all_test_reward/std": 0.09009375423192978, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7854639365430732, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.06010839552618563, "learning_rate": 2.680755565515103e-06, "loss": 0.0024, "num_tokens": 33607566.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.7856484043534403, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.07157875271514058, "learning_rate": 2.6763687347859633e-06, "loss": 0.0029, "num_tokens": 33617149.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.7858328721638074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.057982145342975855, "learning_rate": 2.671984941740926e-06, "loss": 0.0023, "num_tokens": 33623239.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 492.25, "completions/mean_terminated_length": 492.25, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.7860173399741746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.019644893996883184, "learning_rate": 2.667604188198296e-06, "loss": 0.0008, "num_tokens": 33633353.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.7862018077845416, "frac_reward_zero_std": 1.0, "grad_norm": 0.57421875, "kl": 0.11078841798007488, "learning_rate": 2.6632264759751104e-06, "loss": 0.0044, "num_tokens": 33640882.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7863862755949087, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.05849815218243748, "learning_rate": 2.658851806887156e-06, "loss": 0.0023, "num_tokens": 33646634.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 4263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.7865707434052758, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.029831243795342743, "learning_rate": 2.6544801827489487e-06, "loss": 0.0012, "num_tokens": 33653083.0, "reward": 1.5789473056793213, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5789473652839661, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 346.875, "completions/mean_terminated_length": 346.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7867552112156428, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.05741134285926819, "learning_rate": 2.650111605373743e-06, "loss": 0.0023, "num_tokens": 33663218.0, "reward": 1.6749999523162842, "reward_std": 0.7038688063621521, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.37664297223091125, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 404.25, "completions/mean_terminated_length": 404.25, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7869396790260099, "frac_reward_zero_std": 1.0, "grad_norm": 0.045166015625, "kl": 0.03063931583892554, "learning_rate": 2.645746076573532e-06, "loss": 0.0012, "num_tokens": 33671428.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 647.625, "completions/mean_terminated_length": 647.625, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.7871241468363771, "frac_reward_zero_std": 0.0, "grad_norm": 0.5390625, "kl": 0.0314294493291527, "learning_rate": 2.6413835981590387e-06, "loss": 0.0013, "num_tokens": 33688441.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 200.75, "completions/mean_terminated_length": 200.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.7873086146467442, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.04912951681762934, "learning_rate": 2.6370241719397306e-06, "loss": 0.002, "num_tokens": 33693031.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 386.875, "completions/mean_terminated_length": 386.875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7874930824571112, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.04286158480681479, "learning_rate": 2.6326677997238002e-06, "loss": 0.0017, "num_tokens": 33704870.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.7876775502674783, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.06275265221484005, "learning_rate": 2.628314483318178e-06, "loss": 0.0025, "num_tokens": 33713416.0, "reward": 1.3125, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.7878620180778454, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0769812110811472, "learning_rate": 2.6239642245285223e-06, "loss": 0.0031, "num_tokens": 33718082.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7880464858882125, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.07745312107726932, "learning_rate": 2.619617025159232e-06, "loss": 0.0031, "num_tokens": 33727625.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.7882309536985797, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.050567543832585216, "learning_rate": 2.615272887013429e-06, "loss": 0.002, "num_tokens": 33735855.0, "reward": 1.9539473056793213, "reward_std": 0.1302565187215805, "rewards/fixed_code_pass_all_test_reward/mean": 0.9539473652839661, "rewards/fixed_code_pass_all_test_reward/std": 0.1302565187215805, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 116.625, "completions/mean_terminated_length": 116.625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.7884154215089467, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.054870487190783024, "learning_rate": 2.6109318118929673e-06, "loss": 0.0022, "num_tokens": 33739500.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 400.625, "completions/mean_terminated_length": 400.625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.7885998893193138, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.02595153928268701, "learning_rate": 2.6065938015984292e-06, "loss": 0.001, "num_tokens": 33747057.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 313.0, "completions/mean_terminated_length": 313.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7887843571296809, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.055943886283785105, "learning_rate": 2.602258857929133e-06, "loss": 0.0022, "num_tokens": 33754209.0, "reward": 1.9342105388641357, "reward_std": 0.1860807240009308, "rewards/fixed_code_pass_all_test_reward/mean": 0.9342105388641357, "rewards/fixed_code_pass_all_test_reward/std": 0.1860807240009308, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 344.625, "completions/mean_terminated_length": 344.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.7889688249400479, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.06887308706063777, "learning_rate": 2.5979269826831155e-06, "loss": 0.0028, "num_tokens": 33766126.0, "reward": 1.2690972089767456, "reward_std": 0.13707898557186127, "rewards/fixed_code_pass_all_test_reward/mean": 0.2690972089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.13707898557186127, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 213.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.789153292750415, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.04347641649655998, "learning_rate": 2.5935981776571473e-06, "loss": 0.0017, "num_tokens": 33771672.0, "reward": 1.6398026943206787, "reward_std": 0.11996433883905411, "rewards/fixed_code_pass_all_test_reward/mean": 0.6398026943206787, "rewards/fixed_code_pass_all_test_reward/std": 0.1199643537402153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 210.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.7893377605607822, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.0750428237952292, "learning_rate": 2.589272444646723e-06, "loss": 0.003, "num_tokens": 33778443.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 448.75, "completions/mean_terminated_length": 448.75, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.7895222283711493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.02969958307221532, "learning_rate": 2.5849497854460582e-06, "loss": 0.0012, "num_tokens": 33787657.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.7897066961815163, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.04328444669954479, "learning_rate": 2.5806302018481066e-06, "loss": 0.0017, "num_tokens": 33791449.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.7898911639918834, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.07555373013019562, "learning_rate": 2.5763136956445345e-06, "loss": 0.003, "num_tokens": 33800705.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7900756318022505, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.04961626906879246, "learning_rate": 2.572000268625732e-06, "loss": 0.002, "num_tokens": 33805994.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7902600996126176, "frac_reward_zero_std": 1.0, "grad_norm": 0.1767578125, "kl": 0.0772890280932188, "learning_rate": 2.5676899225808216e-06, "loss": 0.0031, "num_tokens": 33811954.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 448.125, "completions/mean_terminated_length": 448.125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7904445674229846, "frac_reward_zero_std": 1.0, "grad_norm": 0.041259765625, "kl": 0.024500319734215736, "learning_rate": 2.5633826592976374e-06, "loss": 0.001, "num_tokens": 33821979.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 317.75, "completions/mean_terminated_length": 317.75, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.7906290352333518, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.0379832818871364, "learning_rate": 2.5590784805627433e-06, "loss": 0.0015, "num_tokens": 33833049.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 328.75, "completions/mean_terminated_length": 328.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.7908135030437189, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.05940041085705161, "learning_rate": 2.5547773881614168e-06, "loss": 0.0024, "num_tokens": 33839023.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 372.0, "completions/mean_terminated_length": 372.0, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.790997970854086, "frac_reward_zero_std": 1.0, "grad_norm": 0.043701171875, "kl": 0.01747466341475956, "learning_rate": 2.5504793838776585e-06, "loss": 0.0007, "num_tokens": 33846343.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 457.5, "completions/mean_terminated_length": 457.5, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.791182438664453, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.053074328461661935, "learning_rate": 2.5461844694941884e-06, "loss": 0.0021, "num_tokens": 33855603.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.7913669064748201, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.09859229298308492, "learning_rate": 2.541892646792441e-06, "loss": 0.0039, "num_tokens": 33864117.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 379.0, "completions/mean_terminated_length": 379.0, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7915513742851872, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.02346890384797007, "learning_rate": 2.537603917552577e-06, "loss": 0.0009, "num_tokens": 33877533.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.7917358420955544, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.0329418140463531, "learning_rate": 2.5333182835534665e-06, "loss": 0.0013, "num_tokens": 33883238.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.7919203099059214, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.04256973718293011, "learning_rate": 2.5290357465726977e-06, "loss": 0.0017, "num_tokens": 33887419.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 291.75, "completions/mean_terminated_length": 291.75, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7921047777162885, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.02516692184144631, "learning_rate": 2.52475630838657e-06, "loss": 0.001, "num_tokens": 33893473.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 451.625, "completions/mean_terminated_length": 451.625, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.7922892455266556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0439453125, "kl": 0.028201518929563463, "learning_rate": 2.520479970770111e-06, "loss": 0.0011, "num_tokens": 33902446.0, "reward": 1.615384578704834, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6153846383094788, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.7924737133370227, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.16995790507644415, "learning_rate": 2.5162067354970475e-06, "loss": 0.0068, "num_tokens": 33911222.0, "reward": 0.9898648262023926, "reward_std": 1.0585428476333618, "rewards/fixed_code_pass_all_test_reward/mean": 0.48986485600471497, "rewards/fixed_code_pass_all_test_reward/std": 0.5243597030639648, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 4296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 610.0, "completions/mean_terminated_length": 610.0, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.7926581811473897, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.03912397148087621, "learning_rate": 2.5119366043398265e-06, "loss": 0.0016, "num_tokens": 33929582.0, "reward": 1.7395833730697632, "reward_std": 0.3462884724140167, "rewards/fixed_code_pass_all_test_reward/mean": 0.7395833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.34628844261169434, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.7928426489577569, "frac_reward_zero_std": 1.0, "grad_norm": 0.0439453125, "kl": 0.035325705306604505, "learning_rate": 2.507669579069603e-06, "loss": 0.0014, "num_tokens": 33935292.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.793027116768124, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.07902715587988496, "learning_rate": 2.503405661456252e-06, "loss": 0.0032, "num_tokens": 33938970.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 189.625, "completions/mean_terminated_length": 189.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.7932115845784911, "frac_reward_zero_std": 0.0, "grad_norm": 3.703125, "kl": 0.08654956985265017, "learning_rate": 2.4991448532683526e-06, "loss": 0.0035, "num_tokens": 33943255.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.7933960523888581, "frac_reward_zero_std": 1.0, "grad_norm": 0.2294921875, "kl": 0.05315377586521208, "learning_rate": 2.4948871562731957e-06, "loss": 0.0021, "num_tokens": 33948600.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 384.875, "completions/mean_terminated_length": 384.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7935805201992252, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.06264660833403468, "learning_rate": 2.490632572236782e-06, "loss": 0.0025, "num_tokens": 33954831.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 252.875, "completions/mean_terminated_length": 252.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.7937649880095923, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.09858393343165517, "learning_rate": 2.4863811029238185e-06, "loss": 0.0039, "num_tokens": 33963646.0, "reward": 1.6749999523162842, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 568.375, "completions/mean_terminated_length": 568.375, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.7939494558199595, "frac_reward_zero_std": 0.0, "grad_norm": 0.74609375, "kl": 0.018192733637988567, "learning_rate": 2.482132750097729e-06, "loss": 0.0007, "num_tokens": 33977289.0, "reward": 1.9768518209457397, "reward_std": 0.06547286361455917, "rewards/fixed_code_pass_all_test_reward/mean": 0.9768518209457397, "rewards/fixed_code_pass_all_test_reward/std": 0.06547285616397858, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 427.0, "completions/mean_terminated_length": 427.0, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7941339236303265, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.05917160981334746, "learning_rate": 2.477887515520634e-06, "loss": 0.0024, "num_tokens": 33988673.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.7943183914406936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0546875, "kl": 0.03561734227696434, "learning_rate": 2.473645400953366e-06, "loss": 0.0014, "num_tokens": 33997501.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 516.375, "completions/mean_terminated_length": 516.375, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.7945028592510607, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.029307395918294787, "learning_rate": 2.4694064081554614e-06, "loss": 0.0012, "num_tokens": 34008080.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 255.75, "completions/mean_terminated_length": 255.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.7946873270614278, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.06334744580090046, "learning_rate": 2.4651705388851653e-06, "loss": 0.0025, "num_tokens": 34016054.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 209.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.7948717948717948, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.0547631389927119, "learning_rate": 2.460937794899425e-06, "loss": 0.0022, "num_tokens": 34025616.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.795056262682162, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.04421070637181401, "learning_rate": 2.456708177953889e-06, "loss": 0.0018, "num_tokens": 34032747.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.7952407304925291, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.05710498313419521, "learning_rate": 2.4524816898029125e-06, "loss": 0.0023, "num_tokens": 34042710.0, "reward": 1.4712837934494019, "reward_std": 0.6572958827018738, "rewards/fixed_code_pass_all_test_reward/mean": 0.5962837934494019, "rewards/fixed_code_pass_all_test_reward/std": 0.36969050765037537, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 165.375, "completions/mean_terminated_length": 165.375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.7954251983028962, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.06170115573331714, "learning_rate": 2.448258332199548e-06, "loss": 0.0025, "num_tokens": 34050313.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 210.375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.7956096661132632, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.048469264060258865, "learning_rate": 2.444038106895559e-06, "loss": 0.0019, "num_tokens": 34058452.0, "reward": 1.75, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.7957941339236303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.05505953682586551, "learning_rate": 2.4398210156414005e-06, "loss": 0.0022, "num_tokens": 34065481.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 230.5, "completions/mean_terminated_length": 230.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7959786017339974, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.05136312241666019, "learning_rate": 2.4356070601862327e-06, "loss": 0.0021, "num_tokens": 34075733.0, "reward": 1.9027776718139648, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.9027777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.7961630695443646, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.07260117586702108, "learning_rate": 2.431396242277907e-06, "loss": 0.0029, "num_tokens": 34085015.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 408.25, "completions/mean_terminated_length": 408.25, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.7963475373547316, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.02590440830681473, "learning_rate": 2.4271885636629887e-06, "loss": 0.001, "num_tokens": 34092745.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 325.25, "completions/mean_terminated_length": 325.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7965320051650987, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.03735036717262119, "learning_rate": 2.4229840260867288e-06, "loss": 0.0015, "num_tokens": 34099731.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.7967164729754658, "frac_reward_zero_std": 1.0, "grad_norm": 0.0537109375, "kl": 0.04583033942617476, "learning_rate": 2.418782631293076e-06, "loss": 0.0018, "num_tokens": 34109397.0, "reward": 1.0833333730697632, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 315.25, "completions/mean_terminated_length": 315.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7969009407858328, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.05155441234819591, "learning_rate": 2.414584381024682e-06, "loss": 0.0021, "num_tokens": 34118823.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 329.875, "completions/mean_terminated_length": 329.875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.7970854085961999, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.03587420389521867, "learning_rate": 2.410389277022884e-06, "loss": 0.0014, "num_tokens": 34130838.0, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.03125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7972698764065671, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.03195630665868521, "learning_rate": 2.406197321027729e-06, "loss": 0.0013, "num_tokens": 34135050.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 138.25, "completions/mean_terminated_length": 138.25, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7974543442169342, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.09691015537828207, "learning_rate": 2.402008514777943e-06, "loss": 0.0039, "num_tokens": 34140924.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 170.125, "completions/mean_terminated_length": 170.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.7976388120273012, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.050687407376244664, "learning_rate": 2.3978228600109564e-06, "loss": 0.002, "num_tokens": 34148365.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 400.375, "completions/mean_terminated_length": 400.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7978232798376683, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.09580708760768175, "learning_rate": 2.3936403584628886e-06, "loss": 0.0038, "num_tokens": 34159488.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 597.75, "completions/mean_terminated_length": 597.75, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.7980077476480354, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.03497372078709304, "learning_rate": 2.3894610118685458e-06, "loss": 0.0014, "num_tokens": 34174510.0, "reward": 1.995192289352417, "reward_std": 0.013598235324025154, "rewards/fixed_code_pass_all_test_reward/mean": 0.995192289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.013598216697573662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.7981922154584025, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.07445475272834301, "learning_rate": 2.3852848219614376e-06, "loss": 0.003, "num_tokens": 34182143.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 204.25, "completions/mean_terminated_length": 204.25, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.7983766832687696, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.04786619823426008, "learning_rate": 2.3811117904737546e-06, "loss": 0.0019, "num_tokens": 34191729.0, "reward": 1.5535714626312256, "reward_std": 0.16967636346817017, "rewards/fixed_code_pass_all_test_reward/mean": 0.5535714030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.16967640817165375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 101.875, "completions/mean_terminated_length": 101.875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.7985611510791367, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.042909818701446056, "learning_rate": 2.3769419191363797e-06, "loss": 0.0017, "num_tokens": 34195256.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 228.75, "completions/mean_terminated_length": 228.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.7987456188895038, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.041065021068789065, "learning_rate": 2.3727752096788814e-06, "loss": 0.0016, "num_tokens": 34204838.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 337.25, "completions/mean_terminated_length": 337.25, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.7989300866998709, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.06168402801267803, "learning_rate": 2.3686116638295287e-06, "loss": 0.0025, "num_tokens": 34211848.0, "reward": 1.626329779624939, "reward_std": 0.27842727303504944, "rewards/fixed_code_pass_all_test_reward/mean": 0.626329779624939, "rewards/fixed_code_pass_all_test_reward/std": 0.2784273028373718, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 302.75, "completions/mean_terminated_length": 302.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.7991145545102379, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.057169937528669834, "learning_rate": 2.3644512833152677e-06, "loss": 0.0023, "num_tokens": 34224318.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 461.25, "completions/mean_terminated_length": 461.25, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.799299022320605, "frac_reward_zero_std": 1.0, "grad_norm": 0.265625, "kl": 0.04256734484806657, "learning_rate": 2.360294069861733e-06, "loss": 0.0017, "num_tokens": 34232856.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.7994834901309722, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.06874925130978227, "learning_rate": 2.3561400251932463e-06, "loss": 0.0027, "num_tokens": 34240514.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 347.125, "completions/mean_terminated_length": 347.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7996679579413393, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.07630272395908833, "learning_rate": 2.3519891510328143e-06, "loss": 0.0031, "num_tokens": 34251355.0, "reward": 1.966417908668518, "reward_std": 0.0949845016002655, "rewards/fixed_code_pass_all_test_reward/mean": 0.9664179086685181, "rewards/fixed_code_pass_all_test_reward/std": 0.0949845016002655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7998524257517063, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.03134145319927484, "learning_rate": 2.347841449102136e-06, "loss": 0.0013, "num_tokens": 34255490.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 391.0, "completions/mean_terminated_length": 391.0, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.8000368935620734, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.0429565932136029, "learning_rate": 2.3436969211215842e-06, "loss": 0.0017, "num_tokens": 34266298.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 361.875, "completions/mean_terminated_length": 361.875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.8002213613724405, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.04600483528338373, "learning_rate": 2.339555568810221e-06, "loss": 0.0018, "num_tokens": 34272761.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 131.375, "completions/mean_terminated_length": 131.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.8004058291828076, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "kl": 0.06564518250524998, "learning_rate": 2.335417393885786e-06, "loss": 0.0026, "num_tokens": 34276492.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 200.375, "completions/mean_terminated_length": 200.375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.8005902969931747, "frac_reward_zero_std": 0.0, "grad_norm": 3.875, "kl": 0.05857174191623926, "learning_rate": 2.3312823980647115e-06, "loss": 0.0023, "num_tokens": 34281023.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 393.5, "completions/mean_terminated_length": 393.5, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.8007747648035418, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.08510532323271036, "learning_rate": 2.327150583062101e-06, "loss": 0.0034, "num_tokens": 34292531.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 304.125, "completions/mean_terminated_length": 304.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.8009592326139089, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.05377354589290917, "learning_rate": 2.323021950591743e-06, "loss": 0.0022, "num_tokens": 34300964.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.801143700424276, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.043910259613767266, "learning_rate": 2.3188965023661026e-06, "loss": 0.0018, "num_tokens": 34305546.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 495.125, "completions/mean_terminated_length": 495.125, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.801328168234643, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.03241604124195874, "learning_rate": 2.3147742400963267e-06, "loss": 0.0013, "num_tokens": 34314635.0, "reward": 1.100000023841858, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.10000000149011612, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 215.625, "completions/mean_terminated_length": 215.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.8015126360450101, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.0595185118727386, "learning_rate": 2.3106551654922448e-06, "loss": 0.0024, "num_tokens": 34320176.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.8016971038553773, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.06104171881452203, "learning_rate": 2.3065392802623575e-06, "loss": 0.0024, "num_tokens": 34329570.0, "reward": 1.7845745086669922, "reward_std": 0.2003803849220276, "rewards/fixed_code_pass_all_test_reward/mean": 0.7845745086669922, "rewards/fixed_code_pass_all_test_reward/std": 0.2003803849220276, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.8018815716657444, "frac_reward_zero_std": 1.0, "grad_norm": 0.2099609375, "kl": 0.07308926363475621, "learning_rate": 2.302426586113846e-06, "loss": 0.0029, "num_tokens": 34339046.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.8020660394761114, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.05040420638397336, "learning_rate": 2.2983170847525636e-06, "loss": 0.002, "num_tokens": 34344851.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 312.875, "completions/mean_terminated_length": 312.875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8022505072864785, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0563970603980124, "learning_rate": 2.294210777883047e-06, "loss": 0.0023, "num_tokens": 34354602.0, "reward": 1.9107142686843872, "reward_std": 0.25253817439079285, "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.8024349750968456, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.05759191594552249, "learning_rate": 2.2901076672085034e-06, "loss": 0.0023, "num_tokens": 34359140.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.8026194429072127, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.036787885590456426, "learning_rate": 2.286007754430812e-06, "loss": 0.0015, "num_tokens": 34365037.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 301.625, "completions/mean_terminated_length": 301.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.8028039107175797, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.06699393223971128, "learning_rate": 2.2819110412505297e-06, "loss": 0.0027, "num_tokens": 34374890.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.8029883785279469, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.045554180862382054, "learning_rate": 2.2778175293668813e-06, "loss": 0.0018, "num_tokens": 34380376.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 423.25, "completions/mean_terminated_length": 423.25, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.803172846338314, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.04043062130222097, "learning_rate": 2.273727220477774e-06, "loss": 0.0016, "num_tokens": 34387818.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.8033573141486811, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.03729835129342973, "learning_rate": 2.2696401162797742e-06, "loss": 0.0015, "num_tokens": 34394930.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 434.375, "completions/mean_terminated_length": 434.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8035417819590481, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.055990442633628845, "learning_rate": 2.265556218468127e-06, "loss": 0.0022, "num_tokens": 34402829.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 227.375, "completions/mean_terminated_length": 227.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8037262497694152, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.044987838715314865, "learning_rate": 2.2614755287367395e-06, "loss": 0.0018, "num_tokens": 34412576.0, "reward": 1.6689188480377197, "reward_std": 0.38119471073150635, "rewards/fixed_code_pass_all_test_reward/mean": 0.6689189672470093, "rewards/fixed_code_pass_all_test_reward/std": 0.38119471073150635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 373.75, "completions/mean_terminated_length": 373.75, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.8039107175797823, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.06690167775377631, "learning_rate": 2.257398048778202e-06, "loss": 0.0027, "num_tokens": 34428062.0, "reward": 1.5, "reward_std": 0.4225771427154541, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.4225771427154541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8040951853901495, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.02161662810249254, "learning_rate": 2.2533237802837615e-06, "loss": 0.0009, "num_tokens": 34433756.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8042796532005165, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.04636290157213807, "learning_rate": 2.249252724943336e-06, "loss": 0.0019, "num_tokens": 34438814.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 316.125, "completions/mean_terminated_length": 316.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.8044641210108836, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.08306499756872654, "learning_rate": 2.2451848844455117e-06, "loss": 0.0033, "num_tokens": 34447599.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 292.125, "completions/mean_terminated_length": 292.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.8046485888212507, "frac_reward_zero_std": 1.0, "grad_norm": 0.203125, "kl": 0.03939705720404163, "learning_rate": 2.2411202604775384e-06, "loss": 0.0016, "num_tokens": 34456392.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 295.375, "completions/mean_terminated_length": 295.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.8048330566316177, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.05004134401679039, "learning_rate": 2.237058854725337e-06, "loss": 0.002, "num_tokens": 34465475.0, "reward": 1.7054598331451416, "reward_std": 0.16712607443332672, "rewards/fixed_code_pass_all_test_reward/mean": 0.7054597735404968, "rewards/fixed_code_pass_all_test_reward/std": 0.16712608933448792, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 247.25, "completions/mean_terminated_length": 247.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.8050175244419848, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0716640071477741, "learning_rate": 2.233000668873493e-06, "loss": 0.0029, "num_tokens": 34471741.0, "reward": 1.7417582273483276, "reward_std": 0.2932233512401581, "rewards/fixed_code_pass_all_test_reward/mean": 0.7417582273483276, "rewards/fixed_code_pass_all_test_reward/std": 0.29322338104248047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.805201992252352, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.05530515848658979, "learning_rate": 2.228945704605252e-06, "loss": 0.0022, "num_tokens": 34475685.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 326.375, "completions/mean_terminated_length": 326.375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.8053864600627191, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.06958321342244744, "learning_rate": 2.224893963602527e-06, "loss": 0.0028, "num_tokens": 34483776.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8055709278730862, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.02307397429831326, "learning_rate": 2.2208454475458875e-06, "loss": 0.0009, "num_tokens": 34488073.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 248.5, "completions/mean_terminated_length": 248.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.8057553956834532, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.03790565812960267, "learning_rate": 2.216800158114577e-06, "loss": 0.0015, "num_tokens": 34497357.0, "reward": 1.7635540962219238, "reward_std": 0.046856433153152466, "rewards/fixed_code_pass_all_test_reward/mean": 0.7635542154312134, "rewards/fixed_code_pass_all_test_reward/std": 0.046856485307216644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 295.75, "completions/mean_terminated_length": 295.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8059398634938203, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.1284835864789784, "learning_rate": 2.2127580969864925e-06, "loss": 0.0051, "num_tokens": 34503875.0, "reward": 1.158602237701416, "reward_std": 0.16724787652492523, "rewards/fixed_code_pass_all_test_reward/mean": 0.15860214829444885, "rewards/fixed_code_pass_all_test_reward/std": 0.16724787652492523, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 620.25, "completions/mean_terminated_length": 620.25, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.8061243313041874, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.040840097004547715, "learning_rate": 2.208719265838193e-06, "loss": 0.0016, "num_tokens": 34515997.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 215.25, "completions/mean_terminated_length": 215.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.8063087991145546, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.03219496022211388, "learning_rate": 2.2046836663448945e-06, "loss": 0.0013, "num_tokens": 34521679.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 293.75, "completions/mean_terminated_length": 293.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8064932669249216, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.06081193033605814, "learning_rate": 2.200651300180483e-06, "loss": 0.0024, "num_tokens": 34529845.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 370.125, "completions/mean_terminated_length": 370.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8066777347352887, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.044433642411604524, "learning_rate": 2.1966221690174947e-06, "loss": 0.0018, "num_tokens": 34536134.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.8068622025456558, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.04920481122098863, "learning_rate": 2.1925962745271258e-06, "loss": 0.002, "num_tokens": 34540835.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 302.875, "completions/mean_terminated_length": 302.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.8070466703560228, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.056335150031372905, "learning_rate": 2.1885736183792305e-06, "loss": 0.0023, "num_tokens": 34550602.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 575.125, "completions/mean_terminated_length": 575.125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.8072311381663899, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.03894033620599657, "learning_rate": 2.184554202242316e-06, "loss": 0.0016, "num_tokens": 34562227.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 252.875, "completions/mean_terminated_length": 252.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8074156059767571, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.08596139820292592, "learning_rate": 2.1805380277835564e-06, "loss": 0.0034, "num_tokens": 34570722.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 430.625, "completions/mean_terminated_length": 430.625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.8076000737871242, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.0706554795615375, "learning_rate": 2.176525096668769e-06, "loss": 0.0028, "num_tokens": 34579303.0, "reward": 1.7410714626312256, "reward_std": 0.40394824743270874, "rewards/fixed_code_pass_all_test_reward/mean": 0.7410714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.40394824743270874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 94.25, "completions/mean_terminated_length": 94.25, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.8077845415974912, "frac_reward_zero_std": 1.0, "grad_norm": 0.18359375, "kl": 0.08044825517572463, "learning_rate": 2.1725154105624347e-06, "loss": 0.0032, "num_tokens": 34582793.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 347.75, "completions/mean_terminated_length": 347.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.8079690094078583, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.03737159282900393, "learning_rate": 2.1685089711276785e-06, "loss": 0.0015, "num_tokens": 34589991.0, "reward": 1.7608696222305298, "reward_std": 0.061487555503845215, "rewards/fixed_code_pass_all_test_reward/mean": 0.7608695030212402, "rewards/fixed_code_pass_all_test_reward/std": 0.061487555503845215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.8081534772182254, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.038784857373684645, "learning_rate": 2.1645057800262915e-06, "loss": 0.0016, "num_tokens": 34595676.0, "reward": 1.298076868057251, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.42307692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.8083379450285925, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.04381165676750243, "learning_rate": 2.1605058389187094e-06, "loss": 0.0018, "num_tokens": 34600370.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.8085224128389596, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.09173696720972657, "learning_rate": 2.1565091494640187e-06, "loss": 0.0037, "num_tokens": 34605342.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 157.75, "completions/mean_terminated_length": 157.75, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.8087068806493267, "frac_reward_zero_std": 0.0, "grad_norm": 3.296875, "kl": 0.09678955841809511, "learning_rate": 2.1525157133199636e-06, "loss": 0.0039, "num_tokens": 34612372.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 224.625, "completions/mean_terminated_length": 224.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.8088913484596938, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.048058848129585385, "learning_rate": 2.1485255321429297e-06, "loss": 0.0019, "num_tokens": 34620745.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 147.75, "completions/mean_terminated_length": 147.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.8090758162700609, "frac_reward_zero_std": 1.0, "grad_norm": 0.54296875, "kl": 0.0671055248240009, "learning_rate": 2.1445386075879636e-06, "loss": 0.0027, "num_tokens": 34624615.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 327.25, "completions/mean_terminated_length": 327.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8092602840804279, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.024857894401066005, "learning_rate": 2.1405549413087543e-06, "loss": 0.001, "num_tokens": 34631689.0, "reward": 1.432692289352417, "reward_std": 0.027196446433663368, "rewards/fixed_code_pass_all_test_reward/mean": 0.4326923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 291.25, "completions/mean_terminated_length": 291.25, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.809444751890795, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.039008478983305395, "learning_rate": 2.1365745349576395e-06, "loss": 0.0016, "num_tokens": 34640411.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 462.75, "completions/mean_terminated_length": 462.75, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.8096292197011622, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.03626651712693274, "learning_rate": 2.1325973901856033e-06, "loss": 0.0015, "num_tokens": 34649529.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 392.25, "completions/mean_terminated_length": 392.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.8098136875115293, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.05135701014660299, "learning_rate": 2.1286235086422847e-06, "loss": 0.0021, "num_tokens": 34661251.0, "reward": 1.2583333253860474, "reward_std": 0.22378422319889069, "rewards/fixed_code_pass_all_test_reward/mean": 0.25833332538604736, "rewards/fixed_code_pass_all_test_reward/std": 0.2237841933965683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 217.625, "completions/mean_terminated_length": 217.625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8099981553218963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.04864265793003142, "learning_rate": 2.1246528919759603e-06, "loss": 0.0019, "num_tokens": 34671280.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 315.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8101826231322634, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.045352863147854805, "learning_rate": 2.120685541833558e-06, "loss": 0.0018, "num_tokens": 34678532.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8103670909426305, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.043479743297211826, "learning_rate": 2.116721459860649e-06, "loss": 0.0017, "num_tokens": 34683500.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 362.0, "completions/mean_terminated_length": 362.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.8105515587529976, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.0523602026514709, "learning_rate": 2.112760647701444e-06, "loss": 0.0021, "num_tokens": 34695804.0, "reward": 1.947115421295166, "reward_std": 0.1495802402496338, "rewards/fixed_code_pass_all_test_reward/mean": 0.947115421295166, "rewards/fixed_code_pass_all_test_reward/std": 0.14958028495311737, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 440.125, "completions/mean_terminated_length": 440.125, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.8107360265633647, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.04520857613533735, "learning_rate": 2.1088031069988103e-06, "loss": 0.0018, "num_tokens": 34703765.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 288.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.8109204943737318, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.0309693718154449, "learning_rate": 2.1048488393942455e-06, "loss": 0.0012, "num_tokens": 34710465.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 518.0, "completions/mean_terminated_length": 518.0, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.8111049621840989, "frac_reward_zero_std": 1.0, "grad_norm": 0.04248046875, "kl": 0.03427041182294488, "learning_rate": 2.100897846527896e-06, "loss": 0.0014, "num_tokens": 34721993.0, "reward": 1.1578947305679321, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.15789473056793213, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.811289429994466, "frac_reward_zero_std": 1.0, "grad_norm": 0.166015625, "kl": 0.07129963394254446, "learning_rate": 2.0969501300385452e-06, "loss": 0.0029, "num_tokens": 34730605.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.811473897804833, "frac_reward_zero_std": 1.0, "grad_norm": 0.051025390625, "kl": 0.024099118949379772, "learning_rate": 2.0930056915636256e-06, "loss": 0.001, "num_tokens": 34739464.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 429.5, "completions/mean_terminated_length": 429.5, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.8116583656152001, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.038170263171195984, "learning_rate": 2.0890645327392024e-06, "loss": 0.0015, "num_tokens": 34751716.0, "reward": 1.9388021230697632, "reward_std": 0.17309381067752838, "rewards/fixed_code_pass_all_test_reward/mean": 0.9388021230697632, "rewards/fixed_code_pass_all_test_reward/std": 0.17309385538101196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 438.75, "completions/mean_terminated_length": 438.75, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.8118428334255673, "frac_reward_zero_std": 1.0, "grad_norm": 0.4921875, "kl": 0.05277451322763227, "learning_rate": 2.0851266551999836e-06, "loss": 0.0021, "num_tokens": 34760602.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.8120273012359344, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.0556970895268023, "learning_rate": 2.081192060579312e-06, "loss": 0.0022, "num_tokens": 34771382.0, "reward": 1.96195650100708, "reward_std": 0.10760319232940674, "rewards/fixed_code_pass_all_test_reward/mean": 0.9619565010070801, "rewards/fixed_code_pass_all_test_reward/std": 0.10760319977998734, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 295.75, "completions/mean_terminated_length": 295.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.8122117690463014, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.0648818165063858, "learning_rate": 2.077260750509178e-06, "loss": 0.0026, "num_tokens": 34778828.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 568.125, "completions/mean_terminated_length": 568.125, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.8123962368566685, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.059691648464649916, "learning_rate": 2.073332726620203e-06, "loss": 0.0024, "num_tokens": 34794389.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.8125807046670356, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.039754010271281004, "learning_rate": 2.0694079905416475e-06, "loss": 0.0016, "num_tokens": 34799829.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 305.75, "completions/mean_terminated_length": 305.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8127651724774027, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.07333863899111748, "learning_rate": 2.065486543901404e-06, "loss": 0.0029, "num_tokens": 34805515.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 431.25, "completions/mean_terminated_length": 431.25, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.8129496402877698, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.050131421186961234, "learning_rate": 2.0615683883260064e-06, "loss": 0.002, "num_tokens": 34816117.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 310.0, "completions/mean_terminated_length": 310.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8131341080981369, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.08254619152285159, "learning_rate": 2.057653525440616e-06, "loss": 0.0033, "num_tokens": 34826349.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.813318575908504, "frac_reward_zero_std": 0.0, "grad_norm": 3.21875, "kl": 0.051478139124810696, "learning_rate": 2.053741956869041e-06, "loss": 0.0021, "num_tokens": 34830551.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 352.75, "completions/mean_terminated_length": 352.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.813503043718871, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.07150836288928986, "learning_rate": 2.0498336842337117e-06, "loss": 0.0029, "num_tokens": 34840773.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 259.875, "completions/mean_terminated_length": 259.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8136875115292381, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.035287532140500844, "learning_rate": 2.0459287091556957e-06, "loss": 0.0014, "num_tokens": 34848716.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8138719793396052, "frac_reward_zero_std": 1.0, "grad_norm": 0.26953125, "kl": 0.11479804245755076, "learning_rate": 2.04202703325469e-06, "loss": 0.0046, "num_tokens": 34856594.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.8140564471499724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.03660810913424939, "learning_rate": 2.038128658149029e-06, "loss": 0.0015, "num_tokens": 34861001.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.8142409149603395, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.08471974264830351, "learning_rate": 2.0342335854556738e-06, "loss": 0.0034, "num_tokens": 34868803.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8144253827707065, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.060058153700083494, "learning_rate": 2.0303418167902156e-06, "loss": 0.0024, "num_tokens": 34876460.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.8146098505810736, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.03442468645516783, "learning_rate": 2.026453353766876e-06, "loss": 0.0014, "num_tokens": 34885211.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8147943183914407, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.0551202311180532, "learning_rate": 2.0225681979985033e-06, "loss": 0.0022, "num_tokens": 34891729.0, "reward": 1.0233516693115234, "reward_std": 0.37475302815437317, "rewards/fixed_code_pass_all_test_reward/mean": 0.14835165441036224, "rewards/fixed_code_pass_all_test_reward/std": 0.152607724070549, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 354.0, "completions/mean_terminated_length": 354.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8149787862018077, "frac_reward_zero_std": 0.0, "grad_norm": 3.609375, "kl": 0.09943268820643425, "learning_rate": 2.0186863510965816e-06, "loss": 0.004, "num_tokens": 34902561.0, "reward": 1.7840909957885742, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 388.75, "completions/mean_terminated_length": 388.75, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8151632540121748, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.043318162905052304, "learning_rate": 2.0148078146712134e-06, "loss": 0.0017, "num_tokens": 34914639.0, "reward": 1.454545497894287, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4545454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 250.0, "completions/mean_terminated_length": 250.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.815347721822542, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.034407053724862635, "learning_rate": 2.0109325903311326e-06, "loss": 0.0014, "num_tokens": 34921279.0, "reward": 1.78125, "reward_std": 0.405046284198761, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.405046284198761, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 320.5, "completions/mean_terminated_length": 320.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.8155321896329091, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.04781987192109227, "learning_rate": 2.0070606796836966e-06, "loss": 0.0019, "num_tokens": 34929931.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.8157166574432761, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.022154912119731307, "learning_rate": 2.003192084334894e-06, "loss": 0.0009, "num_tokens": 34936377.0, "reward": 1.6348038911819458, "reward_std": 0.01796327531337738, "rewards/fixed_code_pass_all_test_reward/mean": 0.6348039507865906, "rewards/fixed_code_pass_all_test_reward/std": 0.017963241785764694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 428.5, "completions/mean_terminated_length": 428.5, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.8159011252536432, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.05417996505275369, "learning_rate": 1.9993268058893344e-06, "loss": 0.0022, "num_tokens": 34944237.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 246.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8160855930640103, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.05676507228054106, "learning_rate": 1.995464845950249e-06, "loss": 0.0023, "num_tokens": 34952135.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 232.5, "completions/mean_terminated_length": 232.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.8162700608743774, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.05795103753916919, "learning_rate": 1.991606206119494e-06, "loss": 0.0023, "num_tokens": 34957547.0, "reward": 1.9791666269302368, "reward_std": 0.058925606310367584, "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 178.5, "completions/mean_terminated_length": 178.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.8164545286847446, "frac_reward_zero_std": 1.0, "grad_norm": 0.1728515625, "kl": 0.0645260822493583, "learning_rate": 1.987750887997556e-06, "loss": 0.0026, "num_tokens": 34966567.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 362.875, "completions/mean_terminated_length": 362.875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.8166389964951116, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.06078386027365923, "learning_rate": 1.9838988931835345e-06, "loss": 0.0024, "num_tokens": 34974342.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 329.625, "completions/mean_terminated_length": 329.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.8168234643054787, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.059508837293833494, "learning_rate": 1.980050223275153e-06, "loss": 0.0024, "num_tokens": 34985067.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 338.625, "completions/mean_terminated_length": 338.625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.8170079321158458, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.023836463573388755, "learning_rate": 1.976204879868757e-06, "loss": 0.001, "num_tokens": 34995776.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8171923999262128, "frac_reward_zero_std": 1.0, "grad_norm": 0.1728515625, "kl": 0.07336700800806284, "learning_rate": 1.9723628645593106e-06, "loss": 0.0029, "num_tokens": 35003255.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.8173768677365799, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.09418386523611844, "learning_rate": 1.968524178940402e-06, "loss": 0.0038, "num_tokens": 35012494.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.8175613355469471, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.08176109939813614, "learning_rate": 1.964688824604234e-06, "loss": 0.0033, "num_tokens": 35016834.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 286.875, "completions/mean_terminated_length": 286.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.8177458033573142, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.07004867913201451, "learning_rate": 1.9608568031416276e-06, "loss": 0.0028, "num_tokens": 35023201.0, "reward": 1.1521738767623901, "reward_std": 0.061487533152103424, "rewards/fixed_code_pass_all_test_reward/mean": 0.15217390656471252, "rewards/fixed_code_pass_all_test_reward/std": 0.061487551778554916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 198.0, "completions/mean_terminated_length": 198.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.8179302711676812, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.06435310887172818, "learning_rate": 1.957028116142021e-06, "loss": 0.0026, "num_tokens": 35030849.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 263.25, "completions/mean_terminated_length": 263.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8181147389780483, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.07412682380527258, "learning_rate": 1.953202765193476e-06, "loss": 0.003, "num_tokens": 35037403.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 314.625, "completions/mean_terminated_length": 314.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.8182992067884154, "frac_reward_zero_std": 1.0, "grad_norm": 0.05859375, "kl": 0.02831485154456459, "learning_rate": 1.949380751882662e-06, "loss": 0.0011, "num_tokens": 35043656.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 248.875, "completions/mean_terminated_length": 248.875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8184836745987825, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.048057605512440205, "learning_rate": 1.9455620777948693e-06, "loss": 0.0019, "num_tokens": 35049943.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 230.75, "completions/mean_terminated_length": 230.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.8186681424091496, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.059354095719754696, "learning_rate": 1.941746744513999e-06, "loss": 0.0024, "num_tokens": 35057085.0, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8188526102195167, "frac_reward_zero_std": 1.0, "grad_norm": 0.060546875, "kl": 0.033326094038784504, "learning_rate": 1.937934753622568e-06, "loss": 0.0013, "num_tokens": 35066965.0, "reward": 1.569892406463623, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5698924660682678, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 379.625, "completions/mean_terminated_length": 379.625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.8190370780298838, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.05013208044692874, "learning_rate": 1.9341261067017137e-06, "loss": 0.002, "num_tokens": 35074682.0, "reward": 1.9615384340286255, "reward_std": 0.07121694087982178, "rewards/fixed_code_pass_all_test_reward/mean": 0.9615384340286255, "rewards/fixed_code_pass_all_test_reward/std": 0.07121692597866058, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.8192215458402509, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.08339144359342754, "learning_rate": 1.9303208053311763e-06, "loss": 0.0033, "num_tokens": 35079202.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.8194060136506179, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.05506377969868481, "learning_rate": 1.926518851089313e-06, "loss": 0.0022, "num_tokens": 35084239.0, "reward": 1.3465909957885742, "reward_std": 0.048211827874183655, "rewards/fixed_code_pass_all_test_reward/mean": 0.34659093618392944, "rewards/fixed_code_pass_all_test_reward/std": 0.048211827874183655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 165.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.819590481460985, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.07238885248079896, "learning_rate": 1.9227202455530912e-06, "loss": 0.0029, "num_tokens": 35091177.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.8197749492713522, "frac_reward_zero_std": 1.0, "grad_norm": 0.2431640625, "kl": 0.05712813348509371, "learning_rate": 1.918924990298091e-06, "loss": 0.0023, "num_tokens": 35099288.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.8199594170817193, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.07417419599369168, "learning_rate": 1.9151330868985063e-06, "loss": 0.003, "num_tokens": 35107731.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 343.375, "completions/mean_terminated_length": 343.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.8201438848920863, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.04308552504517138, "learning_rate": 1.9113445369271335e-06, "loss": 0.0017, "num_tokens": 35114670.0, "reward": 1.6443965435028076, "reward_std": 0.6644365787506104, "rewards/fixed_code_pass_all_test_reward/mean": 0.7693965435028076, "rewards/fixed_code_pass_all_test_reward/std": 0.31088316440582275, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.8203283527024534, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.03397948038764298, "learning_rate": 1.9075593419553815e-06, "loss": 0.0014, "num_tokens": 35120661.0, "reward": 1.824074149131775, "reward_std": 0.3039681315422058, "rewards/fixed_code_pass_all_test_reward/mean": 0.8240740895271301, "rewards/fixed_code_pass_all_test_reward/std": 0.3039681315422058, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.8205128205128205, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.12641977611929178, "learning_rate": 1.903777503553268e-06, "loss": 0.0051, "num_tokens": 35125140.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 248.375, "completions/mean_terminated_length": 248.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.8206972883231876, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.04809121147263795, "learning_rate": 1.899999023289414e-06, "loss": 0.0019, "num_tokens": 35131383.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 377.375, "completions/mean_terminated_length": 377.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8208817561335547, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.054706958355382085, "learning_rate": 1.896223902731058e-06, "loss": 0.0022, "num_tokens": 35141458.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 305.75, "completions/mean_terminated_length": 305.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.8210662239439218, "frac_reward_zero_std": 1.0, "grad_norm": 0.345703125, "kl": 0.0597208091057837, "learning_rate": 1.8924521434440346e-06, "loss": 0.0024, "num_tokens": 35149128.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.8212506917542889, "frac_reward_zero_std": 1.0, "grad_norm": 0.62890625, "kl": 0.08523845672607422, "learning_rate": 1.8886837469927899e-06, "loss": 0.0034, "num_tokens": 35155923.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 578.375, "completions/mean_terminated_length": 578.375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.821435159564656, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.047801195876672864, "learning_rate": 1.884918714940369e-06, "loss": 0.0019, "num_tokens": 35166046.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.821619627375023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.05056242924183607, "learning_rate": 1.8811570488484332e-06, "loss": 0.002, "num_tokens": 35170494.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 252.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.8218040951853901, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.030392852146178484, "learning_rate": 1.877398750277235e-06, "loss": 0.0012, "num_tokens": 35176962.0, "reward": 1.6428571939468384, "reward_std": 0.6638145446777344, "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.3102611303329468, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.8219885629957573, "frac_reward_zero_std": 1.0, "grad_norm": 0.1904296875, "kl": 0.05369501654058695, "learning_rate": 1.8736438207856378e-06, "loss": 0.0021, "num_tokens": 35181827.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 462.125, "completions/mean_terminated_length": 462.125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.8221730308061244, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.03807606012560427, "learning_rate": 1.869892261931101e-06, "loss": 0.0015, "num_tokens": 35193052.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 482.875, "completions/mean_terminated_length": 482.875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.8223574986164914, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.025457555311731994, "learning_rate": 1.866144075269697e-06, "loss": 0.001, "num_tokens": 35205435.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 278.75, "completions/mean_terminated_length": 278.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.8225419664268585, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.05276914715068415, "learning_rate": 1.8623992623560893e-06, "loss": 0.0021, "num_tokens": 35211761.0, "reward": 1.625, "reward_std": 0.33212438225746155, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.33212441205978394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 403.75, "completions/mean_terminated_length": 403.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8227264342372256, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.03528055548667908, "learning_rate": 1.858657824743545e-06, "loss": 0.0014, "num_tokens": 35218775.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.8229109020475927, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.08411538251675665, "learning_rate": 1.854919763983931e-06, "loss": 0.0034, "num_tokens": 35227937.0, "reward": 1.0625, "reward_std": 0.5629958510398865, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 318.875, "completions/mean_terminated_length": 318.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8230953698579598, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.031109151314012706, "learning_rate": 1.8511850816277145e-06, "loss": 0.0012, "num_tokens": 35235176.0, "reward": 1.9891304969787598, "reward_std": 0.030743766576051712, "rewards/fixed_code_pass_all_test_reward/mean": 0.989130437374115, "rewards/fixed_code_pass_all_test_reward/std": 0.030743766576051712, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 409.5, "completions/mean_terminated_length": 409.5, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.8232798376683269, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.03982253448339179, "learning_rate": 1.8474537792239634e-06, "loss": 0.0016, "num_tokens": 35243548.0, "reward": 1.4166666269302368, "reward_std": 0.11311762034893036, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.11311762034893036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.823464305478694, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.06448340602219105, "learning_rate": 1.8437258583203398e-06, "loss": 0.0026, "num_tokens": 35248771.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.823648773289061, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.03661670954898, "learning_rate": 1.8400013204631052e-06, "loss": 0.0015, "num_tokens": 35254767.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 239.125, "completions/mean_terminated_length": 239.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.8238332410994281, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.06734549487009645, "learning_rate": 1.8362801671971141e-06, "loss": 0.0027, "num_tokens": 35262720.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 355.125, "completions/mean_terminated_length": 355.125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.8240177089097952, "frac_reward_zero_std": 1.0, "grad_norm": 0.048828125, "kl": 0.025641999091021717, "learning_rate": 1.832562400065826e-06, "loss": 0.001, "num_tokens": 35269385.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 249.0, "completions/mean_terminated_length": 249.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.8242021767201624, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.06283437414094806, "learning_rate": 1.8288480206112879e-06, "loss": 0.0025, "num_tokens": 35278681.0, "reward": 1.7063252925872803, "reward_std": 0.0684395357966423, "rewards/fixed_code_pass_all_test_reward/mean": 0.7063252925872803, "rewards/fixed_code_pass_all_test_reward/std": 0.06843952089548111, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.8243866445305295, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.05061634071171284, "learning_rate": 1.8251370303741444e-06, "loss": 0.002, "num_tokens": 35284443.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 345.375, "completions/mean_terminated_length": 345.375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.8245711123408965, "frac_reward_zero_std": 1.0, "grad_norm": 0.053955078125, "kl": 0.014528581319609657, "learning_rate": 1.8214294308936342e-06, "loss": 0.0006, "num_tokens": 35291150.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.8247555801512636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.05038334848359227, "learning_rate": 1.817725223707586e-06, "loss": 0.002, "num_tokens": 35296561.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 369.125, "completions/mean_terminated_length": 369.125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.8249400479616307, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.09062678785994649, "learning_rate": 1.8140244103524317e-06, "loss": 0.0036, "num_tokens": 35303914.0, "reward": 1.125, "reward_std": 0.6408699154853821, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.2020305097103119, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 4472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 223.625, "completions/mean_terminated_length": 223.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.8251245157719977, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.03787305543664843, "learning_rate": 1.810326992363184e-06, "loss": 0.0015, "num_tokens": 35312447.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.8253089835823649, "frac_reward_zero_std": 1.0, "grad_norm": 0.05615234375, "kl": 0.041573902824893594, "learning_rate": 1.8066329712734543e-06, "loss": 0.0017, "num_tokens": 35317435.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 184.375, "completions/mean_terminated_length": 184.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.825493451392732, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.05870689591392875, "learning_rate": 1.8029423486154385e-06, "loss": 0.0023, "num_tokens": 35324654.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8256779192030991, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.06328931520693004, "learning_rate": 1.799255125919933e-06, "loss": 0.0025, "num_tokens": 35330524.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8258623870134661, "frac_reward_zero_std": 1.0, "grad_norm": 0.158203125, "kl": 0.045513433520682156, "learning_rate": 1.795571304716316e-06, "loss": 0.0018, "num_tokens": 35334662.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 369.375, "completions/mean_terminated_length": 369.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8260468548238332, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.048842443618923426, "learning_rate": 1.7918908865325558e-06, "loss": 0.002, "num_tokens": 35345241.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.8262313226342003, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.05259828129783273, "learning_rate": 1.7882138728952126e-06, "loss": 0.0021, "num_tokens": 35355560.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.8264157904445675, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.05186265427619219, "learning_rate": 1.7845402653294264e-06, "loss": 0.0021, "num_tokens": 35359869.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 380.875, "completions/mean_terminated_length": 380.875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.8266002582549346, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.027240270166657865, "learning_rate": 1.7808700653589384e-06, "loss": 0.0011, "num_tokens": 35368012.0, "reward": 1.4583333730697632, "reward_std": 0.39591169357299805, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.8267847260653016, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.07014110521413386, "learning_rate": 1.7772032745060642e-06, "loss": 0.0028, "num_tokens": 35375768.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 218.5, "completions/mean_terminated_length": 218.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8269691938756687, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.026263297302648425, "learning_rate": 1.7735398942917081e-06, "loss": 0.0011, "num_tokens": 35384652.0, "reward": 1.3472222089767456, "reward_std": 0.2875273525714874, "rewards/fixed_code_pass_all_test_reward/mean": 0.347222238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.2875273525714874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 178.375, "completions/mean_terminated_length": 178.375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.8271536616860358, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.04232959309592843, "learning_rate": 1.769879926235365e-06, "loss": 0.0017, "num_tokens": 35391847.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8273381294964028, "frac_reward_zero_std": 1.0, "grad_norm": 0.26171875, "kl": 0.061299748020246625, "learning_rate": 1.766223371855106e-06, "loss": 0.0025, "num_tokens": 35395933.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 479.375, "completions/mean_terminated_length": 479.375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.8275225973067699, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.0330633808625862, "learning_rate": 1.7625702326675952e-06, "loss": 0.0013, "num_tokens": 35407280.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 251.625, "completions/mean_terminated_length": 251.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.8277070651171371, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.03211999521590769, "learning_rate": 1.7589205101880746e-06, "loss": 0.0013, "num_tokens": 35412421.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 103.875, "completions/mean_terminated_length": 103.875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.8278915329275042, "frac_reward_zero_std": 1.0, "grad_norm": 1.3125, "kl": 0.13367631996516138, "learning_rate": 1.7552742059303696e-06, "loss": 0.0053, "num_tokens": 35415956.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 354.125, "completions/mean_terminated_length": 354.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.8280760007378712, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.03911632183007896, "learning_rate": 1.751631321406886e-06, "loss": 0.0016, "num_tokens": 35423485.0, "reward": 1.6184210777282715, "reward_std": 0.170545756816864, "rewards/fixed_code_pass_all_test_reward/mean": 0.6184210777282715, "rewards/fixed_code_pass_all_test_reward/std": 0.1705457866191864, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 223.75, "completions/mean_terminated_length": 223.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.8282604685482383, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.07974466029554605, "learning_rate": 1.7479918581286193e-06, "loss": 0.0032, "num_tokens": 35432163.0, "reward": 1.40625, "reward_std": 0.2734534442424774, "rewards/fixed_code_pass_all_test_reward/mean": 0.40625, "rewards/fixed_code_pass_all_test_reward/std": 0.2734534442424774, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 249.5, "completions/mean_terminated_length": 249.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.8284449363586054, "frac_reward_zero_std": 1.0, "grad_norm": 0.051025390625, "kl": 0.035929389065131545, "learning_rate": 1.7443558176051368e-06, "loss": 0.0014, "num_tokens": 35440127.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 275.625, "completions/mean_terminated_length": 275.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8286294041689725, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.04943253262899816, "learning_rate": 1.7407232013445896e-06, "loss": 0.002, "num_tokens": 35446572.0, "reward": 1.3541667461395264, "reward_std": 0.5272030830383301, "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.32146531343460083, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 175.625, "completions/mean_terminated_length": 175.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8288138719793396, "frac_reward_zero_std": 1.0, "grad_norm": 0.1572265625, "kl": 0.10926313092932105, "learning_rate": 1.7370940108537094e-06, "loss": 0.0044, "num_tokens": 35457065.0, "reward": 1.2857142686843872, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.8289983397897067, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.044343102956190705, "learning_rate": 1.7334682476378017e-06, "loss": 0.0018, "num_tokens": 35462314.0, "reward": 1.0750000476837158, "reward_std": 0.2121320217847824, "rewards/fixed_code_pass_all_test_reward/mean": 0.07500000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320515871048, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 280.5, "completions/mean_terminated_length": 280.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.8291828076000738, "frac_reward_zero_std": 1.0, "grad_norm": 0.04931640625, "kl": 0.025067691109143198, "learning_rate": 1.7298459132007628e-06, "loss": 0.001, "num_tokens": 35469214.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 340.625, "completions/mean_terminated_length": 340.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.8293672754104409, "frac_reward_zero_std": 1.0, "grad_norm": 0.04638671875, "kl": 0.02479601395316422, "learning_rate": 1.7262270090450538e-06, "loss": 0.001, "num_tokens": 35475107.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 279.0, "completions/mean_terminated_length": 279.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.8295517432208079, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.06048473156988621, "learning_rate": 1.7226115366717189e-06, "loss": 0.0024, "num_tokens": 35481419.0, "reward": 1.808823585510254, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8088235259056091, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 449.5, "completions/mean_terminated_length": 449.5, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.829736211031175, "frac_reward_zero_std": 1.0, "grad_norm": 0.060791015625, "kl": 0.047978151589632034, "learning_rate": 1.718999497580376e-06, "loss": 0.0019, "num_tokens": 35489439.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.8299206788415422, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.055125553626567125, "learning_rate": 1.7153908932692244e-06, "loss": 0.0022, "num_tokens": 35494709.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8301051466519093, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.03573262086138129, "learning_rate": 1.7117857252350335e-06, "loss": 0.0014, "num_tokens": 35501187.0, "reward": 1.375, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 137.875, "completions/mean_terminated_length": 137.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.8302896144622763, "frac_reward_zero_std": 0.0, "grad_norm": 3.734375, "kl": 0.09352641040459275, "learning_rate": 1.7081839949731504e-06, "loss": 0.0037, "num_tokens": 35505026.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.8304740822726434, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.03850932186469436, "learning_rate": 1.7045857039774926e-06, "loss": 0.0015, "num_tokens": 35509090.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 204.75, "completions/mean_terminated_length": 204.75, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8306585500830105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.08149336976930499, "learning_rate": 1.700990853740553e-06, "loss": 0.0033, "num_tokens": 35518528.0, "reward": 1.9272727966308594, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9272727370262146, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.8308430178933776, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.04480071447324008, "learning_rate": 1.6973994457534026e-06, "loss": 0.0018, "num_tokens": 35525194.0, "reward": 1.8321428298950195, "reward_std": 0.19820620119571686, "rewards/fixed_code_pass_all_test_reward/mean": 0.8321428894996643, "rewards/fixed_code_pass_all_test_reward/std": 0.19820624589920044, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 220.125, "completions/mean_terminated_length": 220.125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.8310274857037447, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.16040954180061817, "learning_rate": 1.6938114815056762e-06, "loss": 0.0064, "num_tokens": 35534379.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 157.125, "completions/mean_terminated_length": 157.125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8312119535141118, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.11712422780692577, "learning_rate": 1.6902269624855848e-06, "loss": 0.0047, "num_tokens": 35541092.0, "reward": 1.5394736528396606, "reward_std": 0.49592721462249756, "rewards/fixed_code_pass_all_test_reward/mean": 0.5394736528396606, "rewards/fixed_code_pass_all_test_reward/std": 0.49592724442481995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.8313964213244789, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.0647353280801326, "learning_rate": 1.6866458901799076e-06, "loss": 0.0026, "num_tokens": 35545749.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 388.25, "completions/mean_terminated_length": 388.25, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.831580889134846, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.06705842260271311, "learning_rate": 1.6830682660739995e-06, "loss": 0.0027, "num_tokens": 35553407.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.831765356945213, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.08485883008688688, "learning_rate": 1.6794940916517798e-06, "loss": 0.0034, "num_tokens": 35561958.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 422.375, "completions/mean_terminated_length": 422.375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.8319498247555801, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.040982877486385405, "learning_rate": 1.67592336839574e-06, "loss": 0.0016, "num_tokens": 35570705.0, "reward": 1.723557710647583, "reward_std": 0.3541412651538849, "rewards/fixed_code_pass_all_test_reward/mean": 0.723557710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.35414132475852966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.8321342925659473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.05214020935818553, "learning_rate": 1.672356097786938e-06, "loss": 0.0021, "num_tokens": 35576066.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 223.875, "completions/mean_terminated_length": 223.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8323187603763144, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.042158282827585936, "learning_rate": 1.6687922813049974e-06, "loss": 0.0017, "num_tokens": 35581865.0, "reward": 1.159482717514038, "reward_std": 0.3530726134777069, "rewards/fixed_code_pass_all_test_reward/mean": 0.15948276221752167, "rewards/fixed_code_pass_all_test_reward/std": 0.3530726134777069, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 316.375, "completions/mean_terminated_length": 316.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8325032281866814, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.05778200947679579, "learning_rate": 1.6652319204281187e-06, "loss": 0.0023, "num_tokens": 35592244.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 227.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8326876959970485, "frac_reward_zero_std": 1.0, "grad_norm": 0.294921875, "kl": 0.08815758000127971, "learning_rate": 1.6616750166330598e-06, "loss": 0.0035, "num_tokens": 35600429.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 417.0, "completions/mean_terminated_length": 417.0, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.8328721638074156, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.021260678506223485, "learning_rate": 1.658121571395147e-06, "loss": 0.0009, "num_tokens": 35609133.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 257.375, "completions/mean_terminated_length": 257.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.8330566316177827, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.10811090283095837, "learning_rate": 1.6545715861882705e-06, "loss": 0.0043, "num_tokens": 35618960.0, "reward": 1.625, "reward_std": 0.4010751247406006, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.4010751247406006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 372.875, "completions/mean_terminated_length": 372.875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.8332410994281498, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.024279838835354894, "learning_rate": 1.6510250624848923e-06, "loss": 0.001, "num_tokens": 35627511.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.8334255672385169, "frac_reward_zero_std": 1.0, "grad_norm": 0.32421875, "kl": 0.10593490907922387, "learning_rate": 1.6474820017560323e-06, "loss": 0.0042, "num_tokens": 35635709.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 308.125, "completions/mean_terminated_length": 308.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.833610035048884, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.10661570192314684, "learning_rate": 1.643942405471275e-06, "loss": 0.0043, "num_tokens": 35641710.0, "reward": 1.546875, "reward_std": 0.3949316143989563, "rewards/fixed_code_pass_all_test_reward/mean": 0.671875, "rewards/fixed_code_pass_all_test_reward/std": 0.13258251547813416, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 668.375, "completions/mean_terminated_length": 668.375, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.833794502859251, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.03612829139456153, "learning_rate": 1.640406275098768e-06, "loss": 0.0014, "num_tokens": 35655145.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 370.625, "completions/mean_terminated_length": 370.625, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.8339789706696181, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.037613481283187866, "learning_rate": 1.6368736121052199e-06, "loss": 0.0015, "num_tokens": 35663422.0, "reward": 1.8958332538604736, "reward_std": 0.17406213283538818, "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.1740621030330658, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 626.625, "completions/mean_terminated_length": 626.625, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.8341634384799852, "frac_reward_zero_std": 1.0, "grad_norm": 0.023681640625, "kl": 0.014594950538594276, "learning_rate": 1.6333444179559078e-06, "loss": 0.0006, "num_tokens": 35675619.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 202.0, "completions/mean_terminated_length": 202.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.8343479062903524, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.06999063119292259, "learning_rate": 1.629818694114661e-06, "loss": 0.0028, "num_tokens": 35685403.0, "reward": 1.5740740299224854, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5740740895271301, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 298.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8345323741007195, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.029642278619576246, "learning_rate": 1.6262964420438765e-06, "loss": 0.0012, "num_tokens": 35691237.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.8347168419110865, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.07278052251785994, "learning_rate": 1.6227776632045078e-06, "loss": 0.0029, "num_tokens": 35698849.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.8349013097214536, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201171875, "kl": 0.06279275193810463, "learning_rate": 1.6192623590560664e-06, "loss": 0.0025, "num_tokens": 35707327.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 331.625, "completions/mean_terminated_length": 331.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.8350857775318207, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.05995564186014235, "learning_rate": 1.615750531056628e-06, "loss": 0.0024, "num_tokens": 35714108.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 169.25, "completions/mean_terminated_length": 169.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.8352702453421877, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.057407013373449445, "learning_rate": 1.6122421806628208e-06, "loss": 0.0023, "num_tokens": 35719646.0, "reward": 1.4874999523162842, "reward_std": 0.44219425320625305, "rewards/fixed_code_pass_all_test_reward/mean": 0.48749998211860657, "rewards/fixed_code_pass_all_test_reward/std": 0.44219422340393066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.8354547131525549, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.06693766452372074, "learning_rate": 1.608737309329833e-06, "loss": 0.0027, "num_tokens": 35728224.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 437.0, "completions/mean_terminated_length": 437.0, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.835639180962922, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.050541087286546826, "learning_rate": 1.6052359185114075e-06, "loss": 0.002, "num_tokens": 35736768.0, "reward": 1.8896104097366333, "reward_std": 0.11801143735647202, "rewards/fixed_code_pass_all_test_reward/mean": 0.8896104097366333, "rewards/fixed_code_pass_all_test_reward/std": 0.1180114671587944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 466.0, "completions/mean_terminated_length": 466.0, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.8358236487732891, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.039307108614593744, "learning_rate": 1.601738009659849e-06, "loss": 0.0016, "num_tokens": 35744864.0, "reward": 1.4375, "reward_std": 0.810092568397522, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 4531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 381.75, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.8360081165836561, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.03613461903296411, "learning_rate": 1.5982435842260146e-06, "loss": 0.0014, "num_tokens": 35751710.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 202.5, "completions/mean_terminated_length": 202.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.8361925843940232, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.0676859263330698, "learning_rate": 1.5947526436593142e-06, "loss": 0.0027, "num_tokens": 35756202.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 395.75, "completions/mean_terminated_length": 395.75, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.8363770522043903, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.03658604039810598, "learning_rate": 1.591265189407717e-06, "loss": 0.0015, "num_tokens": 35763624.0, "reward": 1.8888888359069824, "reward_std": 0.2375655174255371, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.2375655472278595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 291.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.8365615200147575, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.06493176124058664, "learning_rate": 1.587781222917738e-06, "loss": 0.0026, "num_tokens": 35768895.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.8367459878251245, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.04892566183116287, "learning_rate": 1.5843007456344595e-06, "loss": 0.002, "num_tokens": 35780591.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.8369304556354916, "frac_reward_zero_std": 1.0, "grad_norm": 0.193359375, "kl": 0.06580576882697642, "learning_rate": 1.580823759001503e-06, "loss": 0.0026, "num_tokens": 35785166.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.8371149234458587, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.07025847071781754, "learning_rate": 1.5773502644610495e-06, "loss": 0.0028, "num_tokens": 35794389.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 162.75, "completions/mean_terminated_length": 162.75, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.8372993912562258, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.06072464166209102, "learning_rate": 1.5738802634538264e-06, "loss": 0.0024, "num_tokens": 35801059.0, "reward": 1.9768518209457397, "reward_std": 0.06547286361455917, "rewards/fixed_code_pass_all_test_reward/mean": 0.9768518209457397, "rewards/fixed_code_pass_all_test_reward/std": 0.06547285616397858, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 268.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8374838590665928, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.03456733049824834, "learning_rate": 1.5704137574191202e-06, "loss": 0.0014, "num_tokens": 35807941.0, "reward": 1.9360464811325073, "reward_std": 0.024072064086794853, "rewards/fixed_code_pass_all_test_reward/mean": 0.9360464811325073, "rewards/fixed_code_pass_all_test_reward/std": 0.024072034284472466, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 454.625, "completions/mean_terminated_length": 454.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.83766832687696, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.06821340788155794, "learning_rate": 1.56695074779476e-06, "loss": 0.0027, "num_tokens": 35814530.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 126.375, "completions/mean_terminated_length": 126.375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.8378527946873271, "frac_reward_zero_std": 1.0, "grad_norm": 0.181640625, "kl": 0.07299035065807402, "learning_rate": 1.5634912360171273e-06, "loss": 0.0029, "num_tokens": 35820213.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 351.25, "completions/mean_terminated_length": 351.25, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.8380372624976942, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.036793201230466366, "learning_rate": 1.5600352235211523e-06, "loss": 0.0015, "num_tokens": 35828167.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.8382217303080612, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.13211276847869158, "learning_rate": 1.5565827117403143e-06, "loss": 0.0053, "num_tokens": 35835950.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8384061981184283, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.051994021050632, "learning_rate": 1.5531337021066439e-06, "loss": 0.0021, "num_tokens": 35841203.0, "reward": 1.4249999523162842, "reward_std": 0.2712405323982239, "rewards/fixed_code_pass_all_test_reward/mean": 0.42500001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.27124056220054626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 380.625, "completions/mean_terminated_length": 380.625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.8385906659287954, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.049963890574872494, "learning_rate": 1.5496881960507126e-06, "loss": 0.002, "num_tokens": 35851400.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 293.5, "completions/mean_terminated_length": 293.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.8387751337391626, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.05149173177778721, "learning_rate": 1.5462461950016438e-06, "loss": 0.0021, "num_tokens": 35856820.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.8389596015495296, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.048992145573720336, "learning_rate": 1.542807700387101e-06, "loss": 0.002, "num_tokens": 35863219.0, "reward": 1.4759615659713745, "reward_std": 0.44803386926651, "rewards/fixed_code_pass_all_test_reward/mean": 0.4759615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.4480338990688324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 528.0, "completions/mean_terminated_length": 310.8571472167969, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.8391440693598967, "frac_reward_zero_std": 0.0, "grad_norm": 0.71484375, "kl": 0.07434514863416553, "learning_rate": 1.5393727136333037e-06, "loss": 0.003, "num_tokens": 35873955.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.8393285371702638, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.04041556641459465, "learning_rate": 1.535941236165006e-06, "loss": 0.0016, "num_tokens": 35880872.0, "reward": 1.46875, "reward_std": 0.5934646129608154, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.23991121351718903, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 441.875, "completions/mean_terminated_length": 441.875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.8395130049806309, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.056582678749691695, "learning_rate": 1.5325132694055133e-06, "loss": 0.0023, "num_tokens": 35894615.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.8396974727909979, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.03569282602984458, "learning_rate": 1.529088814776668e-06, "loss": 0.0014, "num_tokens": 35900840.0, "reward": 1.4945652484893799, "reward_std": 0.07685945183038712, "rewards/fixed_code_pass_all_test_reward/mean": 0.4945652186870575, "rewards/fixed_code_pass_all_test_reward/std": 0.07685943692922592, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 481.875, "completions/mean_terminated_length": 481.875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.839881940601365, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.03948037698864937, "learning_rate": 1.525667873698865e-06, "loss": 0.0016, "num_tokens": 35912583.0, "reward": 1.8674242496490479, "reward_std": 0.06047070398926735, "rewards/fixed_code_pass_all_test_reward/mean": 0.8674242496490479, "rewards/fixed_code_pass_all_test_reward/std": 0.060470618307590485, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 386.375, "completions/mean_terminated_length": 386.375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.8400664084117322, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.02855075802654028, "learning_rate": 1.5222504475910337e-06, "loss": 0.0011, "num_tokens": 35921034.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 224.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8402508762220993, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.04105574346613139, "learning_rate": 1.5188365378706504e-06, "loss": 0.0016, "num_tokens": 35932737.0, "reward": 1.3571429252624512, "reward_std": 0.37408778071403503, "rewards/fixed_code_pass_all_test_reward/mean": 0.4821428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.2011265754699707, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 197.375, "completions/mean_terminated_length": 197.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.8404353440324663, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.0357556453673169, "learning_rate": 1.51542614595373e-06, "loss": 0.0014, "num_tokens": 35940444.0, "reward": 1.7750000953674316, "reward_std": 0.1916925311088562, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.1916925460100174, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 126.25, "completions/mean_terminated_length": 126.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.8406198118428334, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.05075794807635248, "learning_rate": 1.5120192732548278e-06, "loss": 0.002, "num_tokens": 35949942.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 487.0, "completions/mean_terminated_length": 487.0, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.8408042796532005, "frac_reward_zero_std": 1.0, "grad_norm": 0.038818359375, "kl": 0.01983599190134555, "learning_rate": 1.5086159211870445e-06, "loss": 0.0008, "num_tokens": 35962990.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 198.25, "completions/mean_terminated_length": 198.25, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.8409887474635676, "frac_reward_zero_std": 0.0, "grad_norm": 3.96875, "kl": 0.04177055228501558, "learning_rate": 1.505216091162015e-06, "loss": 0.0017, "num_tokens": 35967384.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.8411732152739347, "frac_reward_zero_std": 1.0, "grad_norm": 0.060546875, "kl": 0.03273178479867056, "learning_rate": 1.5018197845899162e-06, "loss": 0.0013, "num_tokens": 35975735.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 369.625, "completions/mean_terminated_length": 369.625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.8413576830843018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.05323252035304904, "learning_rate": 1.4984270028794602e-06, "loss": 0.0021, "num_tokens": 35983412.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 309.75, "completions/mean_terminated_length": 309.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.8415421508946689, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.07016396475955844, "learning_rate": 1.495037747437904e-06, "loss": 0.0028, "num_tokens": 35989586.0, "reward": 1.9464285373687744, "reward_std": 0.1515229046344757, "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.15152287483215332, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.841726618705036, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.049562288681045175, "learning_rate": 1.4916520196710326e-06, "loss": 0.002, "num_tokens": 35995044.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.841911086515403, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.059520049020648, "learning_rate": 1.488269820983178e-06, "loss": 0.0024, "num_tokens": 36003543.0, "reward": 1.640625, "reward_std": 0.4974825084209442, "rewards/fixed_code_pass_all_test_reward/mean": 0.640625, "rewards/fixed_code_pass_all_test_reward/std": 0.4974825084209442, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.8420955543257701, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.047883675433695316, "learning_rate": 1.4848911527772002e-06, "loss": 0.0019, "num_tokens": 36010290.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 247.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8422800221361373, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.09339042101055384, "learning_rate": 1.4815160164544984e-06, "loss": 0.0037, "num_tokens": 36016470.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 300.625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.8424644899465044, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.03168045403435826, "learning_rate": 1.4781444134150048e-06, "loss": 0.0013, "num_tokens": 36023555.0, "reward": 1.9086538553237915, "reward_std": 0.2034544199705124, "rewards/fixed_code_pass_all_test_reward/mean": 0.9086538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.20345443487167358, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 341.25, "completions/mean_terminated_length": 341.25, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.8426489577568714, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.06617761903908104, "learning_rate": 1.4747763450571894e-06, "loss": 0.0026, "num_tokens": 36029613.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 229.875, "completions/mean_terminated_length": 229.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.8428334255672385, "frac_reward_zero_std": 1.0, "grad_norm": 0.58203125, "kl": 0.08377264067530632, "learning_rate": 1.4714118127780553e-06, "loss": 0.0034, "num_tokens": 36041724.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.8430178933776056, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.054046317702159286, "learning_rate": 1.4680508179731345e-06, "loss": 0.0022, "num_tokens": 36050237.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 245.0, "completions/mean_terminated_length": 245.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8432023611879726, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.048010987462475896, "learning_rate": 1.4646933620364957e-06, "loss": 0.0019, "num_tokens": 36057749.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 245.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.8433868289983398, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.05576352798379958, "learning_rate": 1.4613394463607423e-06, "loss": 0.0022, "num_tokens": 36062547.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.8435712968087069, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.06784690357744694, "learning_rate": 1.4579890723370027e-06, "loss": 0.0027, "num_tokens": 36066966.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 291.5, "completions/mean_terminated_length": 291.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.843755764619074, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.06122791161760688, "learning_rate": 1.4546422413549422e-06, "loss": 0.0024, "num_tokens": 36076842.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 355.125, "completions/mean_terminated_length": 355.125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.843940232429441, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.061370463110506535, "learning_rate": 1.4512989548027524e-06, "loss": 0.0025, "num_tokens": 36086987.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 198.25, "completions/mean_terminated_length": 198.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.8441247002398081, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.045988376485183835, "learning_rate": 1.447959214067155e-06, "loss": 0.0018, "num_tokens": 36095469.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 385.625, "completions/mean_terminated_length": 385.625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.8443091680501752, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.029818024369888008, "learning_rate": 1.4446230205334056e-06, "loss": 0.0012, "num_tokens": 36104082.0, "reward": 1.840000033378601, "reward_std": 0.1326649785041809, "rewards/fixed_code_pass_all_test_reward/mean": 0.8400000333786011, "rewards/fixed_code_pass_all_test_reward/std": 0.1326649934053421, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.8444936358605424, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.09938232880085707, "learning_rate": 1.4412903755852837e-06, "loss": 0.004, "num_tokens": 36114473.0, "reward": 1.7704081535339355, "reward_std": 0.42735493183135986, "rewards/fixed_code_pass_all_test_reward/mean": 0.7704081535339355, "rewards/fixed_code_pass_all_test_reward/std": 0.42735496163368225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.8446781036709095, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.06702969665639102, "learning_rate": 1.4379612806050991e-06, "loss": 0.0027, "num_tokens": 36124036.0, "reward": 1.898936152458191, "reward_std": 0.20807908475399017, "rewards/fixed_code_pass_all_test_reward/mean": 0.8989361524581909, "rewards/fixed_code_pass_all_test_reward/std": 0.20807906985282898, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 363.375, "completions/mean_terminated_length": 363.375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.8448625714812765, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.03878262755461037, "learning_rate": 1.4346357369736852e-06, "loss": 0.0016, "num_tokens": 36132255.0, "reward": 1.6624999046325684, "reward_std": 0.2163459062576294, "rewards/fixed_code_pass_all_test_reward/mean": 0.6625000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2163459062576294, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 299.25, "completions/mean_terminated_length": 299.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.8450470392916436, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.05826690560206771, "learning_rate": 1.4313137460704107e-06, "loss": 0.0023, "num_tokens": 36137617.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 172.0, "completions/mean_terminated_length": 172.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.8452315071020107, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.04980900790542364, "learning_rate": 1.4279953092731636e-06, "loss": 0.002, "num_tokens": 36144689.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 468.5, "completions/mean_terminated_length": 468.5, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.8454159749123777, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.05099462205544114, "learning_rate": 1.4246804279583593e-06, "loss": 0.002, "num_tokens": 36155877.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 285.875, "completions/mean_terminated_length": 285.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.8456004427227449, "frac_reward_zero_std": 1.0, "grad_norm": 0.05029296875, "kl": 0.03099394403398037, "learning_rate": 1.4213691035009369e-06, "loss": 0.0012, "num_tokens": 36162876.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 123.125, "completions/mean_terminated_length": 123.125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.845784910533112, "frac_reward_zero_std": 0.0, "grad_norm": 3.625, "kl": 0.1165216313675046, "learning_rate": 1.4180613372743678e-06, "loss": 0.0047, "num_tokens": 36166605.0, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8459693783434791, "frac_reward_zero_std": 1.0, "grad_norm": 0.46875, "kl": 0.04652205645106733, "learning_rate": 1.414757130650638e-06, "loss": 0.0019, "num_tokens": 36173679.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 577.875, "completions/mean_terminated_length": 577.875, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.8461538461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.03413509973324835, "learning_rate": 1.4114564850002621e-06, "loss": 0.0014, "num_tokens": 36187454.0, "reward": 1.9861111640930176, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.9861111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 496.25, "completions/mean_terminated_length": 496.25, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.8463383139642132, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.02897996106185019, "learning_rate": 1.4081594016922772e-06, "loss": 0.0012, "num_tokens": 36196296.0, "reward": 1.8317307233810425, "reward_std": 0.3524683713912964, "rewards/fixed_code_pass_all_test_reward/mean": 0.9567307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.10768739134073257, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 95.125, "completions/mean_terminated_length": 95.125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.8465227817745803, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.06465986650437117, "learning_rate": 1.4048658820942386e-06, "loss": 0.0026, "num_tokens": 36201697.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 299.0, "completions/mean_terminated_length": 299.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.8467072495849475, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.0416982255410403, "learning_rate": 1.4015759275722318e-06, "loss": 0.0017, "num_tokens": 36207177.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 229.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.8468917173953145, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.06279166927561164, "learning_rate": 1.398289539490858e-06, "loss": 0.0025, "num_tokens": 36215084.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 259.125, "completions/mean_terminated_length": 259.125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.8470761852056816, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.08326053759083152, "learning_rate": 1.3950067192132387e-06, "loss": 0.0033, "num_tokens": 36224901.0, "reward": 1.2166666984558105, "reward_std": 0.25634798407554626, "rewards/fixed_code_pass_all_test_reward/mean": 0.21666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.25634798407554626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 342.25, "completions/mean_terminated_length": 342.25, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.8472606530160487, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.023244231764692813, "learning_rate": 1.391727468101015e-06, "loss": 0.0009, "num_tokens": 36231607.0, "reward": 1.9479166269302368, "reward_std": 0.1473139524459839, "rewards/fixed_code_pass_all_test_reward/mean": 0.9479166269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.1473139226436615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 263.25, "completions/mean_terminated_length": 263.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8474451208264158, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.0368062179768458, "learning_rate": 1.3884517875143544e-06, "loss": 0.0015, "num_tokens": 36237825.0, "reward": 1.1304347515106201, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1304347813129425, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 441.125, "completions/mean_terminated_length": 441.125, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.8476295886367828, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.029643778630997986, "learning_rate": 1.3851796788119354e-06, "loss": 0.0012, "num_tokens": 36247130.0, "reward": 1.4791667461395264, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.7291666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 343.625, "completions/mean_terminated_length": 343.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.84781405644715, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.05277453665621579, "learning_rate": 1.381911143350958e-06, "loss": 0.0021, "num_tokens": 36257007.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 310.625, "completions/mean_terminated_length": 310.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.8479985242575171, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.05569289228878915, "learning_rate": 1.3786461824871412e-06, "loss": 0.0022, "num_tokens": 36264404.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 357.0, "completions/mean_terminated_length": 357.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8481829920678842, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.028482365829404444, "learning_rate": 1.375384797574718e-06, "loss": 0.0011, "num_tokens": 36271812.0, "reward": 1.2037036418914795, "reward_std": 0.052378296852111816, "rewards/fixed_code_pass_all_test_reward/mean": 0.20370370149612427, "rewards/fixed_code_pass_all_test_reward/std": 0.052378278225660324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8483674598782512, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.07386972522363067, "learning_rate": 1.3721269899664436e-06, "loss": 0.003, "num_tokens": 36276873.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.8485519276886183, "frac_reward_zero_std": 1.0, "grad_norm": 0.2734375, "kl": 0.05230918282177299, "learning_rate": 1.3688727610135844e-06, "loss": 0.0021, "num_tokens": 36282725.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.8487363954989854, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.07337671052664518, "learning_rate": 1.3656221120659253e-06, "loss": 0.0029, "num_tokens": 36288911.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8489208633093526, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "kl": 0.1016972167417407, "learning_rate": 1.362375044471762e-06, "loss": 0.0041, "num_tokens": 36293145.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 217.0, "completions/mean_terminated_length": 217.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.8491053311197196, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.10109318885952234, "learning_rate": 1.359131559577911e-06, "loss": 0.004, "num_tokens": 36301457.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 432.875, "completions/mean_terminated_length": 432.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.8492897989300867, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.03986170864664018, "learning_rate": 1.3558916587297011e-06, "loss": 0.0016, "num_tokens": 36312768.0, "reward": 1.9017857313156128, "reward_std": 0.27779191732406616, "rewards/fixed_code_pass_all_test_reward/mean": 0.9017857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.27779194712638855, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 135.5, "completions/mean_terminated_length": 135.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.8494742667404538, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.08625580370426178, "learning_rate": 1.352655343270972e-06, "loss": 0.0035, "num_tokens": 36316820.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.8496587345508209, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0695064296014607, "learning_rate": 1.349422614544077e-06, "loss": 0.0028, "num_tokens": 36325980.0, "reward": 1.7884615659713745, "reward_std": 0.3916930854320526, "rewards/fixed_code_pass_all_test_reward/mean": 0.7884615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.391693115234375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.8498432023611879, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.05126539128832519, "learning_rate": 1.346193473889883e-06, "loss": 0.0021, "num_tokens": 36333638.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 366.75, "completions/mean_terminated_length": 366.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.8500276701715551, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.060020266799256206, "learning_rate": 1.3429679226477631e-06, "loss": 0.0024, "num_tokens": 36345956.0, "reward": 1.8257575035095215, "reward_std": 0.0404941663146019, "rewards/fixed_code_pass_all_test_reward/mean": 0.825757622718811, "rewards/fixed_code_pass_all_test_reward/std": 0.04049412161111832, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 492.625, "completions/mean_terminated_length": 492.625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.8502121379819222, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.07930431282147765, "learning_rate": 1.339745962155613e-06, "loss": 0.0032, "num_tokens": 36359969.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 244.875, "completions/mean_terminated_length": 244.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8503966057922893, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.03303694981150329, "learning_rate": 1.33652759374983e-06, "loss": 0.0013, "num_tokens": 36366040.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 238.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.8505810736026563, "frac_reward_zero_std": 1.0, "grad_norm": 0.2138671875, "kl": 0.10135067347437143, "learning_rate": 1.3333128187653232e-06, "loss": 0.0041, "num_tokens": 36371562.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 362.875, "completions/mean_terminated_length": 362.875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.8507655414130234, "frac_reward_zero_std": 1.0, "grad_norm": 0.061279296875, "kl": 0.06321327341720462, "learning_rate": 1.3301016385355093e-06, "loss": 0.0025, "num_tokens": 36385345.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.8509500092233905, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.05432414240203798, "learning_rate": 1.3268940543923226e-06, "loss": 0.0022, "num_tokens": 36389709.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 162.625, "completions/mean_terminated_length": 162.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.8511344770337577, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.07068352727219462, "learning_rate": 1.3236900676661956e-06, "loss": 0.0028, "num_tokens": 36393874.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 198.625, "completions/mean_terminated_length": 198.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.8513189448441247, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.09748638700693846, "learning_rate": 1.3204896796860734e-06, "loss": 0.0039, "num_tokens": 36403119.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 219.375, "completions/mean_terminated_length": 219.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.8515034126544918, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.07277064584195614, "learning_rate": 1.3172928917794059e-06, "loss": 0.0029, "num_tokens": 36408738.0, "reward": 1.3495371341705322, "reward_std": 0.3737199604511261, "rewards/fixed_code_pass_all_test_reward/mean": 0.34953707456588745, "rewards/fixed_code_pass_all_test_reward/std": 0.3737199902534485, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 300.625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8516878804648589, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.03887731162831187, "learning_rate": 1.314099705272156e-06, "loss": 0.0016, "num_tokens": 36414911.0, "reward": 1.7000000476837158, "reward_std": 0.18516398966312408, "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.18516401946544647, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.851872348275226, "frac_reward_zero_std": 1.0, "grad_norm": 0.41015625, "kl": 0.10937020555138588, "learning_rate": 1.3109101214887865e-06, "loss": 0.0044, "num_tokens": 36424646.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.852056816085593, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.051818793755955994, "learning_rate": 1.307724141752267e-06, "loss": 0.0021, "num_tokens": 36429075.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 448.0, "completions/mean_terminated_length": 448.0, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.8522412838959601, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.06165636237710714, "learning_rate": 1.3045417673840743e-06, "loss": 0.0025, "num_tokens": 36437035.0, "reward": 1.8928571939468384, "reward_std": 0.14787116646766663, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.14787118136882782, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 727.875, "completions/mean_terminated_length": 727.875, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.8524257517063273, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.03653772303368896, "learning_rate": 1.3013629997041854e-06, "loss": 0.0015, "num_tokens": 36453754.0, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 300.25, "completions/mean_terminated_length": 300.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.8526102195166944, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.055946414126083255, "learning_rate": 1.29818784003109e-06, "loss": 0.0022, "num_tokens": 36462044.0, "reward": 1.21875, "reward_std": 0.3705088198184967, "rewards/fixed_code_pass_all_test_reward/mean": 0.21875, "rewards/fixed_code_pass_all_test_reward/std": 0.3705088496208191, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.8527946873270614, "frac_reward_zero_std": 0.0, "grad_norm": 3.21875, "kl": 0.05257557868026197, "learning_rate": 1.295016289681772e-06, "loss": 0.0021, "num_tokens": 36466917.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8529791551374285, "frac_reward_zero_std": 1.0, "grad_norm": 0.06103515625, "kl": 0.03461666824296117, "learning_rate": 1.2918483499717238e-06, "loss": 0.0014, "num_tokens": 36474464.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.8531636229477956, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.09462274890393019, "learning_rate": 1.2886840222149356e-06, "loss": 0.0038, "num_tokens": 36482713.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8533480907581626, "frac_reward_zero_std": 0.0, "grad_norm": 3.203125, "kl": 0.09526948165148497, "learning_rate": 1.2855233077239048e-06, "loss": 0.0038, "num_tokens": 36491052.0, "reward": 1.6409574747085571, "reward_std": 0.4970894455909729, "rewards/fixed_code_pass_all_test_reward/mean": 0.6409574747085571, "rewards/fixed_code_pass_all_test_reward/std": 0.4970894455909729, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.8535325585685298, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.06142843747511506, "learning_rate": 1.2823662078096277e-06, "loss": 0.0025, "num_tokens": 36498848.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.8537170263788969, "frac_reward_zero_std": 1.0, "grad_norm": 0.049072265625, "kl": 0.014961406588554382, "learning_rate": 1.2792127237816e-06, "loss": 0.0006, "num_tokens": 36504813.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 298.125, "completions/mean_terminated_length": 298.125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.853901494189264, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.04095750884152949, "learning_rate": 1.2760628569478196e-06, "loss": 0.0016, "num_tokens": 36511646.0, "reward": 1.5178570747375488, "reward_std": 0.02164611592888832, "rewards/fixed_code_pass_all_test_reward/mean": 0.5178571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.021646136417984962, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 331.75, "completions/mean_terminated_length": 331.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.854085961999631, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.05302453716285527, "learning_rate": 1.2729166086147803e-06, "loss": 0.0021, "num_tokens": 36520508.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 436.125, "completions/mean_terminated_length": 436.125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.8542704298099981, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.03281714301556349, "learning_rate": 1.269773980087482e-06, "loss": 0.0013, "num_tokens": 36528685.0, "reward": 0.9852941036224365, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.23529411852359772, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 295.875, "completions/mean_terminated_length": 295.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8544548976203652, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.08635202026925981, "learning_rate": 1.2666349726694183e-06, "loss": 0.0035, "num_tokens": 36538956.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 223.25, "completions/mean_terminated_length": 223.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.8546393654307324, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.09114026417955756, "learning_rate": 1.263499587662581e-06, "loss": 0.0036, "num_tokens": 36547894.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 679.625, "completions/mean_terminated_length": 679.625, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.8548238332410995, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.03222242963965982, "learning_rate": 1.2603678263674569e-06, "loss": 0.0013, "num_tokens": 36560675.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 401.375, "completions/mean_terminated_length": 401.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8550083010514665, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.05930410511791706, "learning_rate": 1.2572396900830397e-06, "loss": 0.0024, "num_tokens": 36571606.0, "reward": 1.3020833730697632, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.3020833432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8551927688618336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.03827747912146151, "learning_rate": 1.2541151801068075e-06, "loss": 0.0015, "num_tokens": 36581118.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 426.5, "completions/mean_terminated_length": 426.5, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.8553772366722007, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.03471438167616725, "learning_rate": 1.2509942977347421e-06, "loss": 0.0014, "num_tokens": 36589954.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 408.25, "completions/mean_terminated_length": 408.25, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.8555617044825677, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.032548227813094854, "learning_rate": 1.2478770442613176e-06, "loss": 0.0013, "num_tokens": 36601268.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 293.25, "completions/mean_terminated_length": 293.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.8557461722929349, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.05272328876890242, "learning_rate": 1.2447634209795e-06, "loss": 0.0021, "num_tokens": 36612462.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 237.0, "completions/mean_terminated_length": 237.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.855930640103302, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.04505148041062057, "learning_rate": 1.2416534291807592e-06, "loss": 0.0018, "num_tokens": 36618390.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8561151079136691, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.044333836529403925, "learning_rate": 1.2385470701550495e-06, "loss": 0.0018, "num_tokens": 36626054.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 390.125, "completions/mean_terminated_length": 390.125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.8562995757240361, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.044729172019287944, "learning_rate": 1.2354443451908205e-06, "loss": 0.0018, "num_tokens": 36634167.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 306.75, "completions/mean_terminated_length": 306.75, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.8564840435344032, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.0423901358153671, "learning_rate": 1.2323452555750149e-06, "loss": 0.0017, "num_tokens": 36645197.0, "reward": 1.8828125, "reward_std": 0.3314563035964966, "rewards/fixed_code_pass_all_test_reward/mean": 0.8828125, "rewards/fixed_code_pass_all_test_reward/std": 0.3314563035964966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 344.375, "completions/mean_terminated_length": 344.375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.8566685113447703, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.09075038600713015, "learning_rate": 1.2292498025930699e-06, "loss": 0.0036, "num_tokens": 36655912.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 305.875, "completions/mean_terminated_length": 305.875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.8568529791551375, "frac_reward_zero_std": 1.0, "grad_norm": 0.51953125, "kl": 0.11678973957896233, "learning_rate": 1.2261579875289153e-06, "loss": 0.0047, "num_tokens": 36663231.0, "reward": 1.1818182468414307, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1818181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 415.25, "completions/mean_terminated_length": 415.25, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.8570374469655045, "frac_reward_zero_std": 1.0, "grad_norm": 0.02978515625, "kl": 0.02362314914353192, "learning_rate": 1.2230698116649654e-06, "loss": 0.0009, "num_tokens": 36672225.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 405.25, "completions/mean_terminated_length": 405.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.8572219147758716, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.07097873976454139, "learning_rate": 1.2199852762821307e-06, "loss": 0.0028, "num_tokens": 36681627.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 432.625, "completions/mean_terminated_length": 432.625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8574063825862387, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.030467518139630556, "learning_rate": 1.216904382659806e-06, "loss": 0.0012, "num_tokens": 36688872.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8575908503966058, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.04325729329138994, "learning_rate": 1.2138271320758853e-06, "loss": 0.0017, "num_tokens": 36695377.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.8577753182069728, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.08389021502807736, "learning_rate": 1.2107535258067428e-06, "loss": 0.0034, "num_tokens": 36705423.0, "reward": 1.7720588445663452, "reward_std": 0.3645668923854828, "rewards/fixed_code_pass_all_test_reward/mean": 0.8970588445663452, "rewards/fixed_code_pass_all_test_reward/std": 0.1931859403848648, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.85795978601734, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.07084864983335137, "learning_rate": 1.2076835651272446e-06, "loss": 0.0028, "num_tokens": 36714201.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8581442538277071, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.04345706268213689, "learning_rate": 1.2046172513107424e-06, "loss": 0.0017, "num_tokens": 36718373.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 393.375, "completions/mean_terminated_length": 393.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8583287216380742, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.06496494775637984, "learning_rate": 1.2015545856290767e-06, "loss": 0.0026, "num_tokens": 36724456.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 177.625, "completions/mean_terminated_length": 177.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8585131894484412, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.04552779148798436, "learning_rate": 1.1984955693525791e-06, "loss": 0.0018, "num_tokens": 36731437.0, "reward": 1.6612902879714966, "reward_std": 0.4486406445503235, "rewards/fixed_code_pass_all_test_reward/mean": 0.6612902879714966, "rewards/fixed_code_pass_all_test_reward/std": 0.4486406445503235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 275.625, "completions/mean_terminated_length": 275.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.8586976572588083, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.03113115462474525, "learning_rate": 1.19544020375006e-06, "loss": 0.0012, "num_tokens": 36738042.0, "reward": 1.8189655542373657, "reward_std": 0.31252652406692505, "rewards/fixed_code_pass_all_test_reward/mean": 0.818965494632721, "rewards/fixed_code_pass_all_test_reward/std": 0.31252655386924744, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8588821250691754, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.07863293448463082, "learning_rate": 1.1923884900888205e-06, "loss": 0.0031, "num_tokens": 36742049.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 490.0, "completions/mean_terminated_length": 490.0, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.8590665928795426, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.044750952161848545, "learning_rate": 1.1893404296346422e-06, "loss": 0.0018, "num_tokens": 36754073.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 261.25, "completions/mean_terminated_length": 261.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.8592510606899096, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.043881861260160804, "learning_rate": 1.1862960236518006e-06, "loss": 0.0018, "num_tokens": 36759683.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 408.75, "completions/mean_terminated_length": 408.75, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.8594355285002767, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.026635699323378503, "learning_rate": 1.1832552734030467e-06, "loss": 0.0011, "num_tokens": 36767225.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 243.25, "completions/mean_terminated_length": 243.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.8596199963106438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.036406802013516426, "learning_rate": 1.1802181801496172e-06, "loss": 0.0015, "num_tokens": 36773155.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.8598044641210109, "frac_reward_zero_std": 1.0, "grad_norm": 0.3046875, "kl": 0.10240048542618752, "learning_rate": 1.177184745151234e-06, "loss": 0.0041, "num_tokens": 36782284.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 297.75, "completions/mean_terminated_length": 297.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.8599889319313779, "frac_reward_zero_std": 0.0, "grad_norm": 0.73828125, "kl": 0.03224387764930725, "learning_rate": 1.174154969666097e-06, "loss": 0.0013, "num_tokens": 36791730.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 279.0, "completions/mean_terminated_length": 279.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.8601733997417451, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.04444386763498187, "learning_rate": 1.171128854950897e-06, "loss": 0.0018, "num_tokens": 36801794.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 316.625, "completions/mean_terminated_length": 316.625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.8603578675521122, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.048996549332514405, "learning_rate": 1.1681064022607991e-06, "loss": 0.002, "num_tokens": 36807927.0, "reward": 1.734375, "reward_std": 0.7021570801734924, "rewards/fixed_code_pass_all_test_reward/mean": 0.859375, "rewards/fixed_code_pass_all_test_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 283.625, "completions/mean_terminated_length": 283.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.8605423353624793, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.06045589363202453, "learning_rate": 1.1650876128494504e-06, "loss": 0.0024, "num_tokens": 36817244.0, "reward": 1.75, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8607268031728463, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.04053108184598386, "learning_rate": 1.1620724879689793e-06, "loss": 0.0016, "num_tokens": 36823730.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.8609112709832134, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.07871938147582114, "learning_rate": 1.1590610288699955e-06, "loss": 0.0031, "num_tokens": 36833007.0, "reward": 1.609375, "reward_std": 0.4317220449447632, "rewards/fixed_code_pass_all_test_reward/mean": 0.609375, "rewards/fixed_code_pass_all_test_reward/std": 0.4317220449447632, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.8610957387935805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0439453125, "kl": 0.08020176505669951, "learning_rate": 1.1560532368015886e-06, "loss": 0.0032, "num_tokens": 36843582.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 295.875, "completions/mean_terminated_length": 295.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.8612802066039477, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.058967253658920527, "learning_rate": 1.1530491130113231e-06, "loss": 0.0024, "num_tokens": 36852245.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.8614646744143147, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.05459604552015662, "learning_rate": 1.1500486587452453e-06, "loss": 0.0022, "num_tokens": 36857716.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 225.25, "completions/mean_terminated_length": 225.25, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8616491422246818, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.06562755675986409, "learning_rate": 1.1470518752478787e-06, "loss": 0.0026, "num_tokens": 36865974.0, "reward": 1.78515625, "reward_std": 0.011048543266952038, "rewards/fixed_code_pass_all_test_reward/mean": 0.78515625, "rewards/fixed_code_pass_all_test_reward/std": 0.011048543266952038, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 537.125, "completions/mean_terminated_length": 537.125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.8618336100350489, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.03776209650095552, "learning_rate": 1.1440587637622259e-06, "loss": 0.0015, "num_tokens": 36881519.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.862018077845416, "frac_reward_zero_std": 1.0, "grad_norm": 0.037841796875, "kl": 0.02330950798932463, "learning_rate": 1.1410693255297644e-06, "loss": 0.0009, "num_tokens": 36889943.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 308.625, "completions/mean_terminated_length": 308.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.862202545655783, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.05302603053860366, "learning_rate": 1.1380835617904485e-06, "loss": 0.0021, "num_tokens": 36896876.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 158.75, "completions/mean_terminated_length": 158.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8623870134661502, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.03254163917154074, "learning_rate": 1.135101473782706e-06, "loss": 0.0013, "num_tokens": 36900914.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 259.875, "completions/mean_terminated_length": 259.875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.8625714812765173, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.04129322391236201, "learning_rate": 1.1321230627434465e-06, "loss": 0.0017, "num_tokens": 36906673.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 258.5, "completions/mean_terminated_length": 258.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.8627559490868844, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.04001817409880459, "learning_rate": 1.1291483299080496e-06, "loss": 0.0016, "num_tokens": 36915157.0, "reward": 1.2569444179534912, "reward_std": 0.06371898204088211, "rewards/fixed_code_pass_all_test_reward/mean": 0.2569444477558136, "rewards/fixed_code_pass_all_test_reward/std": 0.0637190118432045, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.8629404168972514, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.05465233407448977, "learning_rate": 1.1261772765103685e-06, "loss": 0.0022, "num_tokens": 36921650.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 266.25, "completions/mean_terminated_length": 266.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8631248847076185, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.06783878616988659, "learning_rate": 1.1232099037827327e-06, "loss": 0.0027, "num_tokens": 36928012.0, "reward": 1.7777777910232544, "reward_std": 0.4157397150993347, "rewards/fixed_code_pass_all_test_reward/mean": 0.7777777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.4157397150993347, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.8633093525179856, "frac_reward_zero_std": 0.0, "grad_norm": 2.953125, "kl": 0.09485160745680332, "learning_rate": 1.1202462129559455e-06, "loss": 0.0038, "num_tokens": 36931933.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 399.25, "completions/mean_terminated_length": 399.25, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.8634938203283528, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.03687676275148988, "learning_rate": 1.1172862052592814e-06, "loss": 0.0015, "num_tokens": 36941311.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 289.875, "completions/mean_terminated_length": 289.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.8636782881387198, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.07044239668175578, "learning_rate": 1.1143298819204874e-06, "loss": 0.0028, "num_tokens": 36950710.0, "reward": 1.8854167461395264, "reward_std": 0.3240906000137329, "rewards/fixed_code_pass_all_test_reward/mean": 0.8854166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 312.125, "completions/mean_terminated_length": 312.125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8638627559490869, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.02942833339329809, "learning_rate": 1.11137724416578e-06, "loss": 0.0012, "num_tokens": 36957791.0, "reward": 1.7727272510528564, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7727272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 454.5, "completions/mean_terminated_length": 454.5, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.864047223759454, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.04610479390248656, "learning_rate": 1.1084282932198543e-06, "loss": 0.0018, "num_tokens": 36969323.0, "reward": 1.8636364936828613, "reward_std": 0.16116461157798767, "rewards/fixed_code_pass_all_test_reward/mean": 0.8636363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.16116458177566528, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 298.625, "completions/mean_terminated_length": 298.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.864231691569821, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.07889191526919603, "learning_rate": 1.1054830303058673e-06, "loss": 0.0032, "num_tokens": 36978288.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 489.375, "completions/mean_terminated_length": 489.375, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.8644161593801881, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.024903807090595365, "learning_rate": 1.102541456645455e-06, "loss": 0.001, "num_tokens": 36989251.0, "reward": 1.8645833730697632, "reward_std": 0.3505593538284302, "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.35055938363075256, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 207.125, "completions/mean_terminated_length": 207.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.8646006271905552, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.07153603760525584, "learning_rate": 1.099603573458715e-06, "loss": 0.0029, "num_tokens": 36993660.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 351.625, "completions/mean_terminated_length": 351.625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.8647850950009224, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.01951608620584011, "learning_rate": 1.0966693819642204e-06, "loss": 0.0008, "num_tokens": 37001601.0, "reward": 1.5374999046325684, "reward_std": 0.3889087438583374, "rewards/fixed_code_pass_all_test_reward/mean": 0.5375000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.388908714056015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 369.75, "completions/mean_terminated_length": 369.75, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8649695628112894, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0586513327434659, "learning_rate": 1.0937388833790052e-06, "loss": 0.0023, "num_tokens": 37009239.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.8651540306216565, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.08273435360752046, "learning_rate": 1.090812078918584e-06, "loss": 0.0033, "num_tokens": 37013648.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 276.25, "completions/mean_terminated_length": 276.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8653384984320236, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.051867523696273565, "learning_rate": 1.0878889697969285e-06, "loss": 0.0021, "num_tokens": 37023586.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 155.625, "completions/mean_terminated_length": 155.625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8655229662423907, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "kl": 0.055994220077991486, "learning_rate": 1.0849695572264807e-06, "loss": 0.0022, "num_tokens": 37027607.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.8657074340527577, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.047270068898797035, "learning_rate": 1.0820538424181514e-06, "loss": 0.0019, "num_tokens": 37031915.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 518.25, "completions/mean_terminated_length": 518.25, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.8658919018631249, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.03857706603594124, "learning_rate": 1.0791418265813137e-06, "loss": 0.0015, "num_tokens": 37041925.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 201.625, "completions/mean_terminated_length": 201.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.866076369673492, "frac_reward_zero_std": 1.0, "grad_norm": 0.2099609375, "kl": 0.03774711478035897, "learning_rate": 1.076233510923812e-06, "loss": 0.0015, "num_tokens": 37047530.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.8662608374838591, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.04767307383008301, "learning_rate": 1.0733288966519517e-06, "loss": 0.0019, "num_tokens": 37051798.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.8664453052942261, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "kl": 0.04979564202949405, "learning_rate": 1.0704279849705035e-06, "loss": 0.002, "num_tokens": 37057058.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.8666297731045932, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.04828170337714255, "learning_rate": 1.0675307770827025e-06, "loss": 0.0019, "num_tokens": 37065184.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.8668142409149603, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.08064281847327948, "learning_rate": 1.06463727419025e-06, "loss": 0.0032, "num_tokens": 37074132.0, "reward": 1.994949460029602, "reward_std": 0.009351746179163456, "rewards/fixed_code_pass_all_test_reward/mean": 0.994949460029602, "rewards/fixed_code_pass_all_test_reward/std": 0.009351727552711964, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 584.625, "completions/mean_terminated_length": 584.625, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.8669987087253275, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.02994975820183754, "learning_rate": 1.0617474774933068e-06, "loss": 0.0012, "num_tokens": 37085329.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8671831765356945, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.11200269963592291, "learning_rate": 1.0588613881904986e-06, "loss": 0.0045, "num_tokens": 37093138.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 235.75, "completions/mean_terminated_length": 235.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8673676443460616, "frac_reward_zero_std": 1.0, "grad_norm": 0.051513671875, "kl": 0.026341847609728575, "learning_rate": 1.0559790074789134e-06, "loss": 0.0011, "num_tokens": 37098368.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.8675521121564287, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.06349189765751362, "learning_rate": 1.0531003365540981e-06, "loss": 0.0025, "num_tokens": 37107073.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8677365799667958, "frac_reward_zero_std": 1.0, "grad_norm": 0.2158203125, "kl": 0.045797620667144656, "learning_rate": 1.0502253766100666e-06, "loss": 0.0018, "num_tokens": 37111933.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 223.625, "completions/mean_terminated_length": 223.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.8679210477771628, "frac_reward_zero_std": 1.0, "grad_norm": 1.765625, "kl": 0.18169233249500394, "learning_rate": 1.0473541288392885e-06, "loss": 0.0073, "num_tokens": 37120050.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 545.875, "completions/mean_terminated_length": 545.875, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.86810551558753, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.02074729942250997, "learning_rate": 1.0444865944326966e-06, "loss": 0.0008, "num_tokens": 37129353.0, "reward": 1.9285714626312256, "reward_std": 0.2020304799079895, "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.2020305097103119, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8682899833978971, "frac_reward_zero_std": 1.0, "grad_norm": 0.23828125, "kl": 0.0730995717458427, "learning_rate": 1.0416227745796792e-06, "loss": 0.0029, "num_tokens": 37133232.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 233.5, "completions/mean_terminated_length": 233.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.8684744512082642, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.060281526762992144, "learning_rate": 1.0387626704680897e-06, "loss": 0.0024, "num_tokens": 37142260.0, "reward": 1.2249999046325684, "reward_std": 0.13887299597263336, "rewards/fixed_code_pass_all_test_reward/mean": 0.22500000894069672, "rewards/fixed_code_pass_all_test_reward/std": 0.13887302577495575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 170.375, "completions/mean_terminated_length": 170.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8686589190186312, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.13119741808623075, "learning_rate": 1.0359062832842381e-06, "loss": 0.0052, "num_tokens": 37146463.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 429.375, "completions/mean_terminated_length": 429.375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.8688433868289983, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.04348164121620357, "learning_rate": 1.0330536142128911e-06, "loss": 0.0017, "num_tokens": 37158826.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 506.25, "completions/mean_terminated_length": 506.25, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.8690278546393654, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.05149833671748638, "learning_rate": 1.0302046644372709e-06, "loss": 0.0021, "num_tokens": 37172020.0, "reward": 1.3333333730697632, "reward_std": 0.4364357590675354, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 224.125, "completions/mean_terminated_length": 224.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.8692123224497326, "frac_reward_zero_std": 1.0, "grad_norm": 0.050537109375, "kl": 0.028032814501784742, "learning_rate": 1.0273594351390659e-06, "loss": 0.0011, "num_tokens": 37184109.0, "reward": 1.5306122303009033, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5306122303009033, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 312.625, "completions/mean_terminated_length": 312.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.8693967902600996, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.0360534000210464, "learning_rate": 1.0245179274984141e-06, "loss": 0.0014, "num_tokens": 37190130.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 164.125, "completions/mean_terminated_length": 164.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.8695812580704667, "frac_reward_zero_std": 0.0, "grad_norm": 5.15625, "kl": 0.3571891435422003, "learning_rate": 1.0216801426939094e-06, "loss": 0.0143, "num_tokens": 37194291.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.8697657258808338, "frac_reward_zero_std": 1.0, "grad_norm": 0.224609375, "kl": 0.07509839767590165, "learning_rate": 1.0188460819026058e-06, "loss": 0.003, "num_tokens": 37201568.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8699501936912009, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.04467900609597564, "learning_rate": 1.0160157463000075e-06, "loss": 0.0018, "num_tokens": 37207613.0, "reward": 1.46875, "reward_std": 0.38816189765930176, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.8701346615015679, "frac_reward_zero_std": 1.0, "grad_norm": 0.46484375, "kl": 0.0720891822129488, "learning_rate": 1.013189137060081e-06, "loss": 0.0029, "num_tokens": 37212061.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8703191293119351, "frac_reward_zero_std": 0.0, "grad_norm": 3.28125, "kl": 0.09442901751026511, "learning_rate": 1.0103662553552407e-06, "loss": 0.0038, "num_tokens": 37216606.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.8705035971223022, "frac_reward_zero_std": 1.0, "grad_norm": 0.3125, "kl": 0.06805178569629788, "learning_rate": 1.0075471023563566e-06, "loss": 0.0027, "num_tokens": 37224676.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 441.875, "completions/mean_terminated_length": 441.875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.8706880649326693, "frac_reward_zero_std": 1.0, "grad_norm": 0.396484375, "kl": 0.043776882463134825, "learning_rate": 1.00473167923275e-06, "loss": 0.0018, "num_tokens": 37233515.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 281.625, "completions/mean_terminated_length": 281.625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.8708725327430363, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.027248687227256596, "learning_rate": 1.0019199871522034e-06, "loss": 0.0011, "num_tokens": 37240472.0, "reward": 1.9017857313156128, "reward_std": 0.18185748159885406, "rewards/fixed_code_pass_all_test_reward/mean": 0.9017857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.18185752630233765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8710570005534034, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.0564593062736094, "learning_rate": 9.991120272809418e-07, "loss": 0.0023, "num_tokens": 37246281.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.8712414683637705, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.04928487236611545, "learning_rate": 9.96307800783647e-07, "loss": 0.002, "num_tokens": 37250314.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 567.375, "completions/mean_terminated_length": 567.375, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.8714259361741377, "frac_reward_zero_std": 0.0, "grad_norm": 0.68359375, "kl": 0.017013753880746663, "learning_rate": 9.935073088234525e-07, "loss": 0.0007, "num_tokens": 37261437.0, "reward": 1.3125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8716104039845047, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.054946138989180326, "learning_rate": 9.90710552561942e-07, "loss": 0.0022, "num_tokens": 37267695.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 153.625, "completions/mean_terminated_length": 153.625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8717948717948718, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.06721562962047756, "learning_rate": 9.879175331591472e-07, "loss": 0.0027, "num_tokens": 37271940.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 322.875, "completions/mean_terminated_length": 322.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.8719793396052389, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.06522265309467912, "learning_rate": 9.85128251773556e-07, "loss": 0.0026, "num_tokens": 37278523.0, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.41052016615867615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 265.125, "completions/mean_terminated_length": 265.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.872163807415606, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.04116551694460213, "learning_rate": 9.823427095620997e-07, "loss": 0.0016, "num_tokens": 37285380.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 348.75, "completions/mean_terminated_length": 348.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.872348275225973, "frac_reward_zero_std": 1.0, "grad_norm": 0.05322265625, "kl": 0.042931410716846585, "learning_rate": 9.795609076801627e-07, "loss": 0.0017, "num_tokens": 37291898.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.8725327430363402, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.05325185914989561, "learning_rate": 9.767828472815722e-07, "loss": 0.0021, "num_tokens": 37300474.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 368.375, "completions/mean_terminated_length": 368.375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.8727172108467073, "frac_reward_zero_std": 1.0, "grad_norm": 0.05029296875, "kl": 0.0396317879203707, "learning_rate": 9.740085295186131e-07, "loss": 0.0016, "num_tokens": 37310189.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 312.75, "completions/mean_terminated_length": 312.75, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.8729016786570744, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.07347585633397102, "learning_rate": 9.712379555420092e-07, "loss": 0.0029, "num_tokens": 37319475.0, "reward": 1.98046875, "reward_std": 0.055242717266082764, "rewards/fixed_code_pass_all_test_reward/mean": 0.98046875, "rewards/fixed_code_pass_all_test_reward/std": 0.05524272099137306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 141.125, "completions/mean_terminated_length": 141.125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.8730861464674414, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.0534189383033663, "learning_rate": 9.684711265009361e-07, "loss": 0.0021, "num_tokens": 37327140.0, "reward": 1.4345238208770752, "reward_std": 0.468258798122406, "rewards/fixed_code_pass_all_test_reward/mean": 0.4345238208770752, "rewards/fixed_code_pass_all_test_reward/std": 0.468258798122406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 381.875, "completions/mean_terminated_length": 381.875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.8732706142778085, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.07221530051901937, "learning_rate": 9.657080435430133e-07, "loss": 0.0029, "num_tokens": 37337875.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 239.75, "completions/mean_terminated_length": 239.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8734550820881756, "frac_reward_zero_std": 1.0, "grad_norm": 0.05908203125, "kl": 0.03966838726773858, "learning_rate": 9.629487078143086e-07, "loss": 0.0016, "num_tokens": 37345129.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8736395498985428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.03568508988246322, "learning_rate": 9.601931204593362e-07, "loss": 0.0014, "num_tokens": 37350396.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 326.25, "completions/mean_terminated_length": 326.25, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.8738240177089098, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.026511800824664533, "learning_rate": 9.574412826210521e-07, "loss": 0.0011, "num_tokens": 37357814.0, "reward": 1.2727272510528564, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 502.75, "completions/mean_terminated_length": 502.75, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.8740084855192769, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.03305198124144226, "learning_rate": 9.546931954408622e-07, "loss": 0.0013, "num_tokens": 37367268.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.874192953329644, "frac_reward_zero_std": 1.0, "grad_norm": 0.2060546875, "kl": 0.06933957198634744, "learning_rate": 9.519488600586091e-07, "loss": 0.0028, "num_tokens": 37375049.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.874377421140011, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.040794689557515085, "learning_rate": 9.492082776125877e-07, "loss": 0.0016, "num_tokens": 37384317.0, "reward": 1.6746987104415894, "reward_std": 0.14342626929283142, "rewards/fixed_code_pass_all_test_reward/mean": 0.6746988296508789, "rewards/fixed_code_pass_all_test_reward/std": 0.14342626929283142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 331.25, "completions/mean_terminated_length": 331.25, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.8745618889503781, "frac_reward_zero_std": 1.0, "grad_norm": 0.0400390625, "kl": 0.02361625654157251, "learning_rate": 9.464714492395322e-07, "loss": 0.0009, "num_tokens": 37391775.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 276.0, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.8747463567607453, "frac_reward_zero_std": 1.0, "grad_norm": 0.046630859375, "kl": 0.01978562888689339, "learning_rate": 9.437383760746188e-07, "loss": 0.0008, "num_tokens": 37397815.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.8749308245711124, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.06176711991429329, "learning_rate": 9.410090592514653e-07, "loss": 0.0025, "num_tokens": 37406497.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.8751152923814794, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.08261024160310626, "learning_rate": 9.382834999021373e-07, "loss": 0.0033, "num_tokens": 37411744.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 234.875, "completions/mean_terminated_length": 234.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.8752997601918465, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.05201123142614961, "learning_rate": 9.35561699157137e-07, "loss": 0.0021, "num_tokens": 37420735.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 328.125, "completions/mean_terminated_length": 328.125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.8754842280022136, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.03594223130494356, "learning_rate": 9.32843658145407e-07, "loss": 0.0014, "num_tokens": 37433160.0, "reward": 1.9576447010040283, "reward_std": 0.02666577883064747, "rewards/fixed_code_pass_all_test_reward/mean": 0.9576446413993835, "rewards/fixed_code_pass_all_test_reward/std": 0.026665804907679558, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 259.5, "completions/mean_terminated_length": 259.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.8756686958125807, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.059883408015593886, "learning_rate": 9.301293779943321e-07, "loss": 0.0024, "num_tokens": 37441964.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 443.375, "completions/mean_terminated_length": 443.375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.8758531636229479, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.05471945181488991, "learning_rate": 9.274188598297373e-07, "loss": 0.0022, "num_tokens": 37452895.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 393.25, "completions/mean_terminated_length": 393.25, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.8760376314333149, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.11690994864329696, "learning_rate": 9.247121047758889e-07, "loss": 0.0047, "num_tokens": 37460193.0, "reward": 1.899999976158142, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 361.25, "completions/mean_terminated_length": 361.25, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.876222099243682, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.04282317194156349, "learning_rate": 9.220091139554888e-07, "loss": 0.0017, "num_tokens": 37470147.0, "reward": 1.162109375, "reward_std": 0.24262812733650208, "rewards/fixed_code_pass_all_test_reward/mean": 0.162109375, "rewards/fixed_code_pass_all_test_reward/std": 0.24262814223766327, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8764065670540491, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "kl": 0.12940824450924993, "learning_rate": 9.19309888489679e-07, "loss": 0.0052, "num_tokens": 37482238.0, "reward": 1.4642857313156128, "reward_std": 0.06612997502088547, "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.06613001972436905, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8765910348644161, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.056217416655272245, "learning_rate": 9.166144294980384e-07, "loss": 0.0022, "num_tokens": 37489078.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 245.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8767755026747832, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.06159643759019673, "learning_rate": 9.139227380985893e-07, "loss": 0.0025, "num_tokens": 37498252.0, "reward": 1.7222222089767456, "reward_std": 0.5796368718147278, "rewards/fixed_code_pass_all_test_reward/mean": 0.8472222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.2828894853591919, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 352.75, "completions/mean_terminated_length": 352.75, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.8769599704851503, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0636623241007328, "learning_rate": 9.112348154077854e-07, "loss": 0.0025, "num_tokens": 37508186.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 338.5, "completions/mean_terminated_length": 338.5, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.8771444382955175, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.029872987186536193, "learning_rate": 9.085506625405182e-07, "loss": 0.0012, "num_tokens": 37519870.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8773289061058845, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.04908198304474354, "learning_rate": 9.058702806101172e-07, "loss": 0.002, "num_tokens": 37525274.0, "reward": 1.9492924213409424, "reward_std": 0.14342260360717773, "rewards/fixed_code_pass_all_test_reward/mean": 0.9492924213409424, "rewards/fixed_code_pass_all_test_reward/std": 0.14342260360717773, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 263.375, "completions/mean_terminated_length": 263.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.8775133739162516, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.05541549948975444, "learning_rate": 9.031936707283439e-07, "loss": 0.0022, "num_tokens": 37533373.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 482.0, "completions/mean_terminated_length": 482.0, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.8776978417266187, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.033755890326574445, "learning_rate": 9.005208340054039e-07, "loss": 0.0014, "num_tokens": 37542941.0, "reward": 1.6349999904632568, "reward_std": 0.033381011337041855, "rewards/fixed_code_pass_all_test_reward/mean": 0.6349999904632568, "rewards/fixed_code_pass_all_test_reward/std": 0.03338091820478439, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 348.25, "completions/mean_terminated_length": 348.25, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.8778823095369858, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.04394383239559829, "learning_rate": 8.97851771549928e-07, "loss": 0.0018, "num_tokens": 37548527.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.8780667773473528, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.11526431841775775, "learning_rate": 8.951864844689872e-07, "loss": 0.0046, "num_tokens": 37558466.0, "reward": 1.8986486196517944, "reward_std": 0.10834921151399612, "rewards/fixed_code_pass_all_test_reward/mean": 0.8986486196517944, "rewards/fixed_code_pass_all_test_reward/std": 0.10834915935993195, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.87825124515772, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.04469630285166204, "learning_rate": 8.92524973868083e-07, "loss": 0.0018, "num_tokens": 37568226.0, "reward": 1.8010752201080322, "reward_std": 0.21265946328639984, "rewards/fixed_code_pass_all_test_reward/mean": 0.801075279712677, "rewards/fixed_code_pass_all_test_reward/std": 0.21265947818756104, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 186.625, "completions/mean_terminated_length": 186.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.8784357129680871, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.0327629498206079, "learning_rate": 8.898672408511555e-07, "loss": 0.0013, "num_tokens": 37574727.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8786201807784542, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.03763758344575763, "learning_rate": 8.87213286520573e-07, "loss": 0.0015, "num_tokens": 37584008.0, "reward": 1.399999976158142, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 189.125, "completions/mean_terminated_length": 189.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.8788046485888212, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.05062096333131194, "learning_rate": 8.845631119771392e-07, "loss": 0.002, "num_tokens": 37592609.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.8789891163991883, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.06866618292406201, "learning_rate": 8.819167183200905e-07, "loss": 0.0027, "num_tokens": 37607199.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.8791735842095554, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.07272018818184733, "learning_rate": 8.792741066470911e-07, "loss": 0.0029, "num_tokens": 37611615.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 333.375, "completions/mean_terminated_length": 333.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.8793580520199226, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.02357656543608755, "learning_rate": 8.766352780542397e-07, "loss": 0.0009, "num_tokens": 37617938.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 353.875, "completions/mean_terminated_length": 353.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.8795425198302896, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.036851231125183403, "learning_rate": 8.740002336360687e-07, "loss": 0.0015, "num_tokens": 37624977.0, "reward": 1.4175000190734863, "reward_std": 0.4832552969455719, "rewards/fixed_code_pass_all_test_reward/mean": 0.41749998927116394, "rewards/fixed_code_pass_all_test_reward/std": 0.4832553267478943, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.8797269876406567, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "kl": 0.035555747570469975, "learning_rate": 8.713689744855347e-07, "loss": 0.0014, "num_tokens": 37629235.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 357.25, "completions/mean_terminated_length": 357.25, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.8799114554510238, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.02872679615393281, "learning_rate": 8.687415016940271e-07, "loss": 0.0011, "num_tokens": 37636765.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 139.25, "completions/mean_terminated_length": 139.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.8800959232613909, "frac_reward_zero_std": 0.0, "grad_norm": 3.1875, "kl": 0.08704255591146648, "learning_rate": 8.66117816351365e-07, "loss": 0.0035, "num_tokens": 37640671.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 507.375, "completions/mean_terminated_length": 507.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.8802803910717579, "frac_reward_zero_std": 0.0, "grad_norm": 0.81640625, "kl": 0.026676894398406148, "learning_rate": 8.634979195457982e-07, "loss": 0.0011, "num_tokens": 37649594.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 370.5, "completions/mean_terminated_length": 370.5, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.8804648588821251, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.02757977496366948, "learning_rate": 8.608818123640017e-07, "loss": 0.0011, "num_tokens": 37657142.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8806493266924922, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.07349559990689158, "learning_rate": 8.582694958910809e-07, "loss": 0.0029, "num_tokens": 37665167.0, "reward": 1.1764706373214722, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1764705926179886, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8808337945028593, "frac_reward_zero_std": 1.0, "grad_norm": 0.154296875, "kl": 0.0613695508800447, "learning_rate": 8.556609712105657e-07, "loss": 0.0025, "num_tokens": 37670145.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8810182623132263, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.0655340445227921, "learning_rate": 8.530562394044207e-07, "loss": 0.0026, "num_tokens": 37675493.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8812027301235934, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.05258434498682618, "learning_rate": 8.504553015530292e-07, "loss": 0.0021, "num_tokens": 37686885.0, "reward": 1.4193549156188965, "reward_std": 0.10200896859169006, "rewards/fixed_code_pass_all_test_reward/mean": 0.4193548262119293, "rewards/fixed_code_pass_all_test_reward/std": 0.10200895369052887, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.8813871979339605, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.06308078370057046, "learning_rate": 8.478581587352052e-07, "loss": 0.0025, "num_tokens": 37691176.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 195.0, "completions/mean_terminated_length": 195.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.8815716657443277, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.10439427709206939, "learning_rate": 8.45264812028187e-07, "loss": 0.0042, "num_tokens": 37699696.0, "reward": 1.9913792610168457, "reward_std": 0.015962397679686546, "rewards/fixed_code_pass_all_test_reward/mean": 0.9913793206214905, "rewards/fixed_code_pass_all_test_reward/std": 0.015962423756718636, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 223.375, "completions/mean_terminated_length": 223.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.8817561335546947, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.034432727727107704, "learning_rate": 8.426752625076373e-07, "loss": 0.0014, "num_tokens": 37705043.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.8819406013650618, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.06493253819644451, "learning_rate": 8.400895112476482e-07, "loss": 0.0026, "num_tokens": 37713458.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 224.625, "completions/mean_terminated_length": 224.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.8821250691754289, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.04666329245083034, "learning_rate": 8.375075593207316e-07, "loss": 0.0019, "num_tokens": 37718303.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 854.375, "completions/mean_terminated_length": 854.375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.882309536985796, "frac_reward_zero_std": 0.0, "grad_norm": 0.73046875, "kl": 0.02515816641971469, "learning_rate": 8.349294077978265e-07, "loss": 0.001, "num_tokens": 37737258.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 218.125, "completions/mean_terminated_length": 218.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.882494004796163, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.07367229601368308, "learning_rate": 8.323550577482908e-07, "loss": 0.0029, "num_tokens": 37743091.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.8826784726065302, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.04371043946594, "learning_rate": 8.297845102399161e-07, "loss": 0.0017, "num_tokens": 37748852.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.8828629404168973, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.05520176549907774, "learning_rate": 8.272177663389047e-07, "loss": 0.0022, "num_tokens": 37754667.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.8830474082272644, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.06553552811965346, "learning_rate": 8.246548271098886e-07, "loss": 0.0026, "num_tokens": 37762643.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.8832318760376314, "frac_reward_zero_std": 0.0, "grad_norm": 15.0, "kl": 0.689812142867595, "learning_rate": 8.220956936159197e-07, "loss": 0.0276, "num_tokens": 37772148.0, "reward": 1.6749999523162842, "reward_std": 0.46521884202957153, "rewards/fixed_code_pass_all_test_reward/mean": 0.675000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.4652188718318939, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8834163438479985, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.0544837333727628, "learning_rate": 8.195403669184676e-07, "loss": 0.0022, "num_tokens": 37776503.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 220.125, "completions/mean_terminated_length": 220.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.8836008116583656, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.04334323783405125, "learning_rate": 8.169888480774335e-07, "loss": 0.0017, "num_tokens": 37788200.0, "reward": 1.398809552192688, "reward_std": 0.19705890119075775, "rewards/fixed_code_pass_all_test_reward/mean": 0.398809552192688, "rewards/fixed_code_pass_all_test_reward/std": 0.19705888628959656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.8837852794687328, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.048095656209625304, "learning_rate": 8.144411381511274e-07, "loss": 0.0019, "num_tokens": 37797336.0, "reward": 1.764423131942749, "reward_std": 0.4372923970222473, "rewards/fixed_code_pass_all_test_reward/mean": 0.7644230723381042, "rewards/fixed_code_pass_all_test_reward/std": 0.4372923970222473, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.8839697472790998, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.03301039827056229, "learning_rate": 8.118972381962853e-07, "loss": 0.0013, "num_tokens": 37805279.0, "reward": 1.46875, "reward_std": 0.45193037390708923, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.4519304037094116, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 377.25, "completions/mean_terminated_length": 377.25, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.8841542150894669, "frac_reward_zero_std": 1.0, "grad_norm": 0.039794921875, "kl": 0.02637515019159764, "learning_rate": 8.093571492680608e-07, "loss": 0.0011, "num_tokens": 37818521.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.884338682899834, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.0554076274856925, "learning_rate": 8.06820872420031e-07, "loss": 0.0022, "num_tokens": 37822818.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.884523150710201, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.049962865421548486, "learning_rate": 8.04288408704188e-07, "loss": 0.002, "num_tokens": 37826871.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 349.75, "completions/mean_terminated_length": 349.75, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.8847076185205681, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.03696225234307349, "learning_rate": 8.017597591709414e-07, "loss": 0.0015, "num_tokens": 37835013.0, "reward": 1.620833396911621, "reward_std": 0.0733279138803482, "rewards/fixed_code_pass_all_test_reward/mean": 0.6208333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.0733279138803482, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.8848920863309353, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.07386987144127488, "learning_rate": 7.992349248691211e-07, "loss": 0.003, "num_tokens": 37840003.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.8850765541413024, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.07006145780906081, "learning_rate": 7.967139068459729e-07, "loss": 0.0028, "num_tokens": 37844401.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 224.5, "completions/mean_terminated_length": 224.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.8852610219516694, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.03719422314316034, "learning_rate": 7.941967061471634e-07, "loss": 0.0015, "num_tokens": 37850285.0, "reward": 1.485576868057251, "reward_std": 0.04079460725188255, "rewards/fixed_code_pass_all_test_reward/mean": 0.48557692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.04079461842775345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 306.25, "completions/mean_terminated_length": 306.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8854454897620365, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.041507235961034894, "learning_rate": 7.916833238167709e-07, "loss": 0.0017, "num_tokens": 37856071.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.8856299575724036, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.04693873994983733, "learning_rate": 7.891737608972927e-07, "loss": 0.0019, "num_tokens": 37863705.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.8858144253827707, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.0549838631413877, "learning_rate": 7.866680184296405e-07, "loss": 0.0022, "num_tokens": 37871175.0, "reward": 1.4732142686843872, "reward_std": 0.9092876315116882, "rewards/fixed_code_pass_all_test_reward/mean": 0.7232142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.44637754559516907, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 337.375, "completions/mean_terminated_length": 337.375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8859988931931378, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.05151062598451972, "learning_rate": 7.841660974531418e-07, "loss": 0.0021, "num_tokens": 37881034.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 447.375, "completions/mean_terminated_length": 447.375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.8861833610035049, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.03480949578806758, "learning_rate": 7.816679990055431e-07, "loss": 0.0014, "num_tokens": 37888957.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.886367828813872, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.05461081117391586, "learning_rate": 7.791737241229979e-07, "loss": 0.0022, "num_tokens": 37896883.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 342.0, "completions/mean_terminated_length": 342.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.8865522966242391, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.07959333388134837, "learning_rate": 7.766832738400798e-07, "loss": 0.0032, "num_tokens": 37910651.0, "reward": 1.6749999523162842, "reward_std": 0.4527692496776581, "rewards/fixed_code_pass_all_test_reward/mean": 0.675000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.45276927947998047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 443.875, "completions/mean_terminated_length": 443.875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.8867367644346061, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.03449374728370458, "learning_rate": 7.741966491897701e-07, "loss": 0.0014, "num_tokens": 37919434.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 313.125, "completions/mean_terminated_length": 313.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8869212322449732, "frac_reward_zero_std": 1.0, "grad_norm": 0.04736328125, "kl": 0.02712263714056462, "learning_rate": 7.717138512034728e-07, "loss": 0.0011, "num_tokens": 37930411.0, "reward": 1.5056179761886597, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5056179761886597, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 370.25, "completions/mean_terminated_length": 370.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8871057000553404, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.04228840011637658, "learning_rate": 7.69234880910995e-07, "loss": 0.0017, "num_tokens": 37940533.0, "reward": 1.8291666507720947, "reward_std": 0.31644728779792786, "rewards/fixed_code_pass_all_test_reward/mean": 0.8291666507720947, "rewards/fixed_code_pass_all_test_reward/std": 0.31644728779792786, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 848.875, "completions/mean_terminated_length": 848.875, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.8872901678657075, "frac_reward_zero_std": 0.0, "grad_norm": 0.640625, "kl": 0.015827878436539322, "learning_rate": 7.667597393405602e-07, "loss": 0.0006, "num_tokens": 37959452.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 95.125, "completions/mean_terminated_length": 95.125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.8874746356760745, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.06298294523730874, "learning_rate": 7.642884275188045e-07, "loss": 0.0025, "num_tokens": 37962885.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.8876591034864416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.055452675092965364, "learning_rate": 7.618209464707737e-07, "loss": 0.0022, "num_tokens": 37971718.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 381.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.8878435712968087, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.05864648334681988, "learning_rate": 7.59357297219927e-07, "loss": 0.0023, "num_tokens": 37983260.0, "reward": 1.6875, "reward_std": 0.22603380680084229, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.2260337918996811, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.8880280391071758, "frac_reward_zero_std": 1.0, "grad_norm": 0.337890625, "kl": 0.047352123889140785, "learning_rate": 7.568974807881335e-07, "loss": 0.0019, "num_tokens": 37991544.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.8882125069175429, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.05898207821883261, "learning_rate": 7.544414981956694e-07, "loss": 0.0024, "num_tokens": 37998804.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 315.0, "completions/mean_terminated_length": 315.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.88839697472791, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.04111054673558101, "learning_rate": 7.519893504612241e-07, "loss": 0.0016, "num_tokens": 38004844.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 423.0, "completions/mean_terminated_length": 423.0, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.8885814425382771, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.04734131461009383, "learning_rate": 7.495410386018964e-07, "loss": 0.0019, "num_tokens": 38013636.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 280.625, "completions/mean_terminated_length": 280.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8887659103486442, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.05877943616360426, "learning_rate": 7.470965636331939e-07, "loss": 0.0024, "num_tokens": 38019401.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 333.0, "completions/mean_terminated_length": 333.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.8889503781590112, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.05935477185994387, "learning_rate": 7.446559265690312e-07, "loss": 0.0024, "num_tokens": 38030457.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8891348459693783, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.03193484141957015, "learning_rate": 7.422191284217306e-07, "loss": 0.0013, "num_tokens": 38035105.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8893193137797454, "frac_reward_zero_std": 1.0, "grad_norm": 0.234375, "kl": 0.14234527200460434, "learning_rate": 7.397861702020237e-07, "loss": 0.0057, "num_tokens": 38039170.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 284.875, "completions/mean_terminated_length": 284.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.8895037815901126, "frac_reward_zero_std": 1.0, "grad_norm": 0.23046875, "kl": 0.0838482475373894, "learning_rate": 7.373570529190499e-07, "loss": 0.0034, "num_tokens": 38048505.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.8896882494004796, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.08953950880095363, "learning_rate": 7.349317775803555e-07, "loss": 0.0036, "num_tokens": 38052923.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 181.75, "completions/mean_terminated_length": 181.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.8898727172108467, "frac_reward_zero_std": 1.0, "grad_norm": 0.5625, "kl": 0.07633126573637128, "learning_rate": 7.325103451918913e-07, "loss": 0.0031, "num_tokens": 38057185.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 266.875, "completions/mean_terminated_length": 266.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.8900571850212138, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.04803755052853376, "learning_rate": 7.30092756758013e-07, "loss": 0.0019, "num_tokens": 38065520.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 128.25, "completions/mean_terminated_length": 128.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8902416528315809, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.053564819041639566, "learning_rate": 7.276790132814881e-07, "loss": 0.0021, "num_tokens": 38069378.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 234.5, "completions/mean_terminated_length": 234.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.8904261206419479, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.05333579145371914, "learning_rate": 7.252691157634839e-07, "loss": 0.0021, "num_tokens": 38079502.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 210.125, "completions/mean_terminated_length": 210.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8906105884523151, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.06405117129907012, "learning_rate": 7.228630652035717e-07, "loss": 0.0026, "num_tokens": 38085079.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 282.0, "completions/mean_terminated_length": 282.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.8907950562626822, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.06966111855581403, "learning_rate": 7.204608625997323e-07, "loss": 0.0028, "num_tokens": 38093951.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8909795240730493, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.050567882135510445, "learning_rate": 7.180625089483439e-07, "loss": 0.002, "num_tokens": 38100919.0, "reward": 1.6363637447357178, "reward_std": 0.21731430292129517, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.21731428802013397, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 289.0, "completions/mean_terminated_length": 289.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8911639918834163, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.06633840431459248, "learning_rate": 7.156680052441955e-07, "loss": 0.0027, "num_tokens": 38110023.0, "reward": 1.7540322542190552, "reward_std": 0.45552536845207214, "rewards/fixed_code_pass_all_test_reward/mean": 0.7540322542190552, "rewards/fixed_code_pass_all_test_reward/std": 0.45552536845207214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 107.875, "completions/mean_terminated_length": 107.875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.8913484596937834, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.0889415720012039, "learning_rate": 7.132773524804748e-07, "loss": 0.0036, "num_tokens": 38117558.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8915329275041505, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "kl": 0.07324084034189582, "learning_rate": 7.108905516487718e-07, "loss": 0.0029, "num_tokens": 38124379.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 344.875, "completions/mean_terminated_length": 344.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.8917173953145177, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.03785091149620712, "learning_rate": 7.085076037390781e-07, "loss": 0.0015, "num_tokens": 38129858.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 222.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.8919018631248847, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.031394826830364764, "learning_rate": 7.061285097397929e-07, "loss": 0.0013, "num_tokens": 38135551.0, "reward": 1.6416666507720947, "reward_std": 0.2931763231754303, "rewards/fixed_code_pass_all_test_reward/mean": 0.6416666507720947, "rewards/fixed_code_pass_all_test_reward/std": 0.2931763827800751, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 298.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.8920863309352518, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.05498857144266367, "learning_rate": 7.037532706377126e-07, "loss": 0.0022, "num_tokens": 38141221.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 241.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.8922707987456189, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.06600506231188774, "learning_rate": 7.013818874180323e-07, "loss": 0.0026, "num_tokens": 38149315.0, "reward": 1.9736841917037964, "reward_std": 0.048727408051490784, "rewards/fixed_code_pass_all_test_reward/mean": 0.9736841917037964, "rewards/fixed_code_pass_all_test_reward/std": 0.048727381974458694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 162.75, "completions/mean_terminated_length": 162.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.892455266555986, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.051881683990359306, "learning_rate": 6.990143610643518e-07, "loss": 0.0021, "num_tokens": 38153353.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.892639734366353, "frac_reward_zero_std": 1.0, "grad_norm": 0.40625, "kl": 0.15278680250048637, "learning_rate": 6.966506925586702e-07, "loss": 0.0061, "num_tokens": 38157172.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 203.375, "completions/mean_terminated_length": 203.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8928242021767202, "frac_reward_zero_std": 1.0, "grad_norm": 0.061279296875, "kl": 0.03461580281145871, "learning_rate": 6.942908828813877e-07, "loss": 0.0014, "num_tokens": 38161639.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.8930086699870873, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.06111397850327194, "learning_rate": 6.919349330113001e-07, "loss": 0.0024, "num_tokens": 38171341.0, "reward": 1.8021653890609741, "reward_std": 0.3829219341278076, "rewards/fixed_code_pass_all_test_reward/mean": 0.8021653890609741, "rewards/fixed_code_pass_all_test_reward/std": 0.3829219341278076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8931931377974544, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.044698982615955174, "learning_rate": 6.895828439256058e-07, "loss": 0.0018, "num_tokens": 38176120.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 476.375, "completions/mean_terminated_length": 476.375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.8933776056078214, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.02826471161097288, "learning_rate": 6.872346165998978e-07, "loss": 0.0011, "num_tokens": 38185851.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 362.625, "completions/mean_terminated_length": 362.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8935620734181885, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.0739243645220995, "learning_rate": 6.848902520081724e-07, "loss": 0.003, "num_tokens": 38195176.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 280.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.8937465412285556, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.06872143130749464, "learning_rate": 6.825497511228229e-07, "loss": 0.0027, "num_tokens": 38203950.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.8939310090389228, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.07854412076994777, "learning_rate": 6.802131149146374e-07, "loss": 0.0031, "num_tokens": 38212233.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.8941154768492898, "frac_reward_zero_std": 0.0, "grad_norm": 3.484375, "kl": 0.08228482329286635, "learning_rate": 6.778803443528015e-07, "loss": 0.0033, "num_tokens": 38216272.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 243.5, "completions/mean_terminated_length": 243.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.8942999446596569, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.05611847061663866, "learning_rate": 6.755514404048969e-07, "loss": 0.0022, "num_tokens": 38224420.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 379.125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.894484412470024, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.04575460101477802, "learning_rate": 6.732264040369063e-07, "loss": 0.0018, "num_tokens": 38231421.0, "reward": 1.9375, "reward_std": 0.14078859984874725, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.14078859984874725, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.894668880280391, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.08307823911309242, "learning_rate": 6.709052362132007e-07, "loss": 0.0033, "num_tokens": 38238507.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 204.125, "completions/mean_terminated_length": 204.125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.8948533480907581, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.05568052316084504, "learning_rate": 6.685879378965532e-07, "loss": 0.0022, "num_tokens": 38248268.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 341.5, "completions/mean_terminated_length": 341.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.8950378159011253, "frac_reward_zero_std": 1.0, "grad_norm": 0.056396484375, "kl": 0.04511105548590422, "learning_rate": 6.662745100481272e-07, "loss": 0.0018, "num_tokens": 38257264.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 232.875, "completions/mean_terminated_length": 232.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.8952222837114924, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.031540483701974154, "learning_rate": 6.639649536274817e-07, "loss": 0.0013, "num_tokens": 38263095.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.8954067515218594, "frac_reward_zero_std": 1.0, "grad_norm": 1.8984375, "kl": 0.1464371953625232, "learning_rate": 6.616592695925749e-07, "loss": 0.0059, "num_tokens": 38268876.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8955912193322265, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.04101904947310686, "learning_rate": 6.59357458899752e-07, "loss": 0.0016, "num_tokens": 38275126.0, "reward": 1.1556122303009033, "reward_std": 0.2669689357280731, "rewards/fixed_code_pass_all_test_reward/mean": 0.15561224520206451, "rewards/fixed_code_pass_all_test_reward/std": 0.26696890592575073, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8957756871425936, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.024422284099273384, "learning_rate": 6.570595225037546e-07, "loss": 0.001, "num_tokens": 38281499.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.8959601549529607, "frac_reward_zero_std": 1.0, "grad_norm": 1.0078125, "kl": 0.11132525256834924, "learning_rate": 6.547654613577148e-07, "loss": 0.0045, "num_tokens": 38290755.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 339.625, "completions/mean_terminated_length": 339.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.8961446227633278, "frac_reward_zero_std": 1.0, "grad_norm": 0.06103515625, "kl": 0.05878149578347802, "learning_rate": 6.524752764131648e-07, "loss": 0.0024, "num_tokens": 38301040.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.8963290905736949, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.06236823135986924, "learning_rate": 6.501889686200202e-07, "loss": 0.0025, "num_tokens": 38305815.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 376.875, "completions/mean_terminated_length": 376.875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.896513558384062, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.049657820258289576, "learning_rate": 6.479065389265938e-07, "loss": 0.002, "num_tokens": 38313254.0, "reward": 1.125, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8966980261944291, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.05274714552797377, "learning_rate": 6.456279882795868e-07, "loss": 0.0021, "num_tokens": 38321619.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 206.125, "completions/mean_terminated_length": 206.125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.8968824940047961, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.040952981100417674, "learning_rate": 6.433533176240913e-07, "loss": 0.0016, "num_tokens": 38331020.0, "reward": 1.9166667461395264, "reward_std": 0.15430331230163574, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.8970669618151632, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.0449875770136714, "learning_rate": 6.410825279035959e-07, "loss": 0.0018, "num_tokens": 38338458.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.8972514296255304, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.051860393956303596, "learning_rate": 6.388156200599726e-07, "loss": 0.0021, "num_tokens": 38346978.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 380.875, "completions/mean_terminated_length": 380.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.8974358974358975, "frac_reward_zero_std": 1.0, "grad_norm": 0.0517578125, "kl": 0.028619298478588462, "learning_rate": 6.36552595033485e-07, "loss": 0.0011, "num_tokens": 38354641.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8976203652462645, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.045317001873627305, "learning_rate": 6.342934537627877e-07, "loss": 0.0018, "num_tokens": 38360518.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 500.75, "completions/mean_terminated_length": 500.75, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8978048330566316, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.06906531704589725, "learning_rate": 6.320381971849254e-07, "loss": 0.0028, "num_tokens": 38369700.0, "reward": 1.6785714626312256, "reward_std": 0.4501376748085022, "rewards/fixed_code_pass_all_test_reward/mean": 0.8035714030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.3657134771347046, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 213.0, "completions/mean_terminated_length": 213.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8979893008669987, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.06380630319472402, "learning_rate": 6.297868262353291e-07, "loss": 0.0026, "num_tokens": 38375508.0, "reward": 1.3250000476837158, "reward_std": 0.5750776529312134, "rewards/fixed_code_pass_all_test_reward/mean": 0.45000001788139343, "rewards/fixed_code_pass_all_test_reward/std": 0.2777460515499115, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 367.125, "completions/mean_terminated_length": 367.125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8981737686773658, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.03883081069216132, "learning_rate": 6.275393418478171e-07, "loss": 0.0016, "num_tokens": 38381941.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 223.625, "completions/mean_terminated_length": 223.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.8983582364877329, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.07684756303206086, "learning_rate": 6.252957449545982e-07, "loss": 0.0031, "num_tokens": 38390314.0, "reward": 1.7972221374511719, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.9222221970558167, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 312.875, "completions/mean_terminated_length": 312.875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8985427042981, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0640505172777921, "learning_rate": 6.230560364862692e-07, "loss": 0.0026, "num_tokens": 38403105.0, "reward": 1.4285714626312256, "reward_std": 0.3081565201282501, "rewards/fixed_code_pass_all_test_reward/mean": 0.4285714030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.3081565201282501, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.8987271721084671, "frac_reward_zero_std": 1.0, "grad_norm": 0.37109375, "kl": 0.07024826039560139, "learning_rate": 6.208202173718114e-07, "loss": 0.0028, "num_tokens": 38411854.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 255.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.8989116399188342, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.13296553865075111, "learning_rate": 6.185882885385952e-07, "loss": 0.0053, "num_tokens": 38420495.0, "reward": 1.1193182468414307, "reward_std": 0.048211827874183655, "rewards/fixed_code_pass_all_test_reward/mean": 0.11931818723678589, "rewards/fixed_code_pass_all_test_reward/std": 0.048211827874183655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.8990961077292012, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.06385162705555558, "learning_rate": 6.163602509123767e-07, "loss": 0.0026, "num_tokens": 38428680.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.8992805755395683, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.0602701548486948, "learning_rate": 6.141361054172946e-07, "loss": 0.0024, "num_tokens": 38433022.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 382.625, "completions/mean_terminated_length": 382.625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.8994650433499355, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0851869797334075, "learning_rate": 6.119158529758817e-07, "loss": 0.0034, "num_tokens": 38440275.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 837.5, "completions/mean_terminated_length": 837.5, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.8996495111603026, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.029991287272423506, "learning_rate": 6.096994945090474e-07, "loss": 0.0012, "num_tokens": 38456671.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 241.25, "completions/mean_terminated_length": 241.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8998339789706696, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.0743577522225678, "learning_rate": 6.074870309360903e-07, "loss": 0.003, "num_tokens": 38465537.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 406.125, "completions/mean_terminated_length": 406.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.9000184467810367, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.07309477066155523, "learning_rate": 6.052784631746899e-07, "loss": 0.0029, "num_tokens": 38477362.0, "reward": 1.6742424964904785, "reward_std": 0.7089595794677734, "rewards/fixed_code_pass_all_test_reward/mean": 0.7992424368858337, "rewards/fixed_code_pass_all_test_reward/std": 0.3863530457019806, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 314.625, "completions/mean_terminated_length": 314.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9002029145914038, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.06580435810610652, "learning_rate": 6.030737921409169e-07, "loss": 0.0026, "num_tokens": 38484575.0, "reward": 1.3333333730697632, "reward_std": 0.35634833574295044, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.3563483655452728, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 566.75, "completions/mean_terminated_length": 566.75, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.9003873824017709, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.04166787338908762, "learning_rate": 6.008730187492184e-07, "loss": 0.0017, "num_tokens": 38498061.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.900571850212138, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.07215636363252997, "learning_rate": 5.986761439124289e-07, "loss": 0.0029, "num_tokens": 38507096.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 203.125, "completions/mean_terminated_length": 203.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.9007563180225051, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.02816341770812869, "learning_rate": 5.964831685417627e-07, "loss": 0.0011, "num_tokens": 38514025.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 260.375, "completions/mean_terminated_length": 260.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.9009407858328722, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.07467985711991787, "learning_rate": 5.942940935468211e-07, "loss": 0.003, "num_tokens": 38523156.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 237.0, "completions/mean_terminated_length": 237.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.9011252536432393, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.11345319030806422, "learning_rate": 5.921089198355834e-07, "loss": 0.0045, "num_tokens": 38535356.0, "reward": 1.0818965435028076, "reward_std": 0.01219149399548769, "rewards/fixed_code_pass_all_test_reward/mean": 0.08189655095338821, "rewards/fixed_code_pass_all_test_reward/std": 0.012191496789455414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 188.0, "completions/mean_terminated_length": 188.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.9013097214536063, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.04256300104316324, "learning_rate": 5.899276483144145e-07, "loss": 0.0017, "num_tokens": 38544900.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 303.5, "completions/mean_terminated_length": 303.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.9014941892639734, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.07047823397442698, "learning_rate": 5.877502798880563e-07, "loss": 0.0028, "num_tokens": 38555904.0, "reward": 1.4602272510528564, "reward_std": 0.28792861104011536, "rewards/fixed_code_pass_all_test_reward/mean": 0.46022725105285645, "rewards/fixed_code_pass_all_test_reward/std": 0.28792861104011536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 239.125, "completions/mean_terminated_length": 239.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.9016786570743405, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "kl": 0.04712275601923466, "learning_rate": 5.855768154596364e-07, "loss": 0.0019, "num_tokens": 38565881.0, "reward": 1.9474637508392334, "reward_std": 0.148594930768013, "rewards/fixed_code_pass_all_test_reward/mean": 0.9474637508392334, "rewards/fixed_code_pass_all_test_reward/std": 0.1485949158668518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 180.125, "completions/mean_terminated_length": 180.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.9018631248847077, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.05127315688878298, "learning_rate": 5.83407255930657e-07, "loss": 0.0021, "num_tokens": 38570266.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 304.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9020475926950747, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.07338872319087386, "learning_rate": 5.812416022010103e-07, "loss": 0.0029, "num_tokens": 38579516.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 480.125, "completions/mean_terminated_length": 480.125, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.9022320605054418, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.03472259733825922, "learning_rate": 5.790798551689591e-07, "loss": 0.0014, "num_tokens": 38588309.0, "reward": 1.9464285373687744, "reward_std": 0.1515229046344757, "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.15152287483215332, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.9024165283158089, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.04917910567019135, "learning_rate": 5.769220157311495e-07, "loss": 0.002, "num_tokens": 38597476.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 210.75, "completions/mean_terminated_length": 210.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.902600996126176, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.06125544989481568, "learning_rate": 5.747680847826076e-07, "loss": 0.0025, "num_tokens": 38603386.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 362.0, "completions/mean_terminated_length": 362.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.902785463936543, "frac_reward_zero_std": 1.0, "grad_norm": 0.1982421875, "kl": 0.05576374800875783, "learning_rate": 5.726180632167355e-07, "loss": 0.0022, "num_tokens": 38614074.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.9029699317469102, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.06382787390612066, "learning_rate": 5.704719519253188e-07, "loss": 0.0026, "num_tokens": 38621720.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 597.875, "completions/mean_terminated_length": 597.875, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.9031543995572773, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.039359122049063444, "learning_rate": 5.683297517985164e-07, "loss": 0.0016, "num_tokens": 38633375.0, "reward": 1.3958332538604736, "reward_std": 0.5034602880477905, "rewards/fixed_code_pass_all_test_reward/mean": 0.3958333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.5034602880477905, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 612.0, "completions/mean_terminated_length": 612.0, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.9033388673676443, "frac_reward_zero_std": 1.0, "grad_norm": 0.026611328125, "kl": 0.013873397198040038, "learning_rate": 5.661914637248667e-07, "loss": 0.0006, "num_tokens": 38645319.0, "reward": 1.6470588445663452, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6470588445663452, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 174.5, "completions/mean_terminated_length": 174.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.9035233351780114, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "kl": 0.1090029003098607, "learning_rate": 5.640570885912844e-07, "loss": 0.0044, "num_tokens": 38649515.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 4898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 158.875, "completions/mean_terminated_length": 158.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9037078029883785, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.08565124962478876, "learning_rate": 5.619266272830647e-07, "loss": 0.0034, "num_tokens": 38653506.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.9038922707987456, "frac_reward_zero_std": 1.0, "grad_norm": 0.1728515625, "kl": 0.06083542900159955, "learning_rate": 5.598000806838766e-07, "loss": 0.0024, "num_tokens": 38657740.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.9040767386091128, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.04057839454617351, "learning_rate": 5.576774496757631e-07, "loss": 0.0016, "num_tokens": 38669023.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.9042612064194798, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.08533969102427363, "learning_rate": 5.555587351391467e-07, "loss": 0.0034, "num_tokens": 38678190.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.9044456742298469, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.057517878245562315, "learning_rate": 5.534439379528267e-07, "loss": 0.0023, "num_tokens": 38682816.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.904630142040214, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.12932231090962887, "learning_rate": 5.513330589939736e-07, "loss": 0.0052, "num_tokens": 38689658.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 338.875, "completions/mean_terminated_length": 338.875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.904814609850581, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.05487160291522741, "learning_rate": 5.492260991381349e-07, "loss": 0.0022, "num_tokens": 38697657.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.9049990776609481, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.05037694610655308, "learning_rate": 5.471230592592314e-07, "loss": 0.002, "num_tokens": 38705066.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 499.875, "completions/mean_terminated_length": 499.875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.9051835454713153, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.03294741082936525, "learning_rate": 5.450239402295565e-07, "loss": 0.0013, "num_tokens": 38716945.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 281.625, "completions/mean_terminated_length": 281.625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.9053680132816824, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.043218726525083184, "learning_rate": 5.429287429197839e-07, "loss": 0.0017, "num_tokens": 38726270.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.9055524810920494, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.04311913903802633, "learning_rate": 5.408374681989547e-07, "loss": 0.0017, "num_tokens": 38732696.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 340.75, "completions/mean_terminated_length": 340.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.9057369489024165, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.05165137862786651, "learning_rate": 5.387501169344833e-07, "loss": 0.0021, "num_tokens": 38743734.0, "reward": 1.0125000476837158, "reward_std": 0.035355329513549805, "rewards/fixed_code_pass_all_test_reward/mean": 0.012500000186264515, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 324.25, "completions/mean_terminated_length": 324.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.9059214167127836, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.06388001097366214, "learning_rate": 5.366666899921569e-07, "loss": 0.0026, "num_tokens": 38749616.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.9061058845231507, "frac_reward_zero_std": 1.0, "grad_norm": 1.0, "kl": 0.07536604185588658, "learning_rate": 5.345871882361398e-07, "loss": 0.003, "num_tokens": 38753436.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.9062903523335178, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.036844339687377214, "learning_rate": 5.32511612528962e-07, "loss": 0.0015, "num_tokens": 38765652.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 373.5, "completions/mean_terminated_length": 373.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.9064748201438849, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.052386425668373704, "learning_rate": 5.304399637315271e-07, "loss": 0.0021, "num_tokens": 38773088.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 390.0, "completions/mean_terminated_length": 390.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.906659287954252, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.04084297874942422, "learning_rate": 5.283722427031112e-07, "loss": 0.0016, "num_tokens": 38784992.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.9068437557646191, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.03934967005625367, "learning_rate": 5.263084503013594e-07, "loss": 0.0016, "num_tokens": 38789436.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 363.125, "completions/mean_terminated_length": 363.125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.9070282235749861, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.06939860573038459, "learning_rate": 5.2424858738229e-07, "loss": 0.0028, "num_tokens": 38796181.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.9072126913853532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.039963401621207595, "learning_rate": 5.221926548002876e-07, "loss": 0.0016, "num_tokens": 38800075.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 347.375, "completions/mean_terminated_length": 347.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.9073971591957204, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.04103419207967818, "learning_rate": 5.20140653408111e-07, "loss": 0.0016, "num_tokens": 38806046.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 213.125, "completions/mean_terminated_length": 213.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.9075816270060875, "frac_reward_zero_std": 1.0, "grad_norm": 0.7890625, "kl": 0.11517698224633932, "learning_rate": 5.180925840568829e-07, "loss": 0.0046, "num_tokens": 38814487.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.9077660948164545, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.020325160468928516, "learning_rate": 5.160484475961003e-07, "loss": 0.0008, "num_tokens": 38822481.0, "reward": 1.1416666507720947, "reward_std": 0.02357027679681778, "rewards/fixed_code_pass_all_test_reward/mean": 0.14166668057441711, "rewards/fixed_code_pass_all_test_reward/std": 0.0235702246427536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.9079505626268216, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.030759482237044722, "learning_rate": 5.140082448736283e-07, "loss": 0.0012, "num_tokens": 38827994.0, "reward": 1.53125, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.53125, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 259.375, "completions/mean_terminated_length": 259.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.9081350304371887, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.0706075974740088, "learning_rate": 5.119719767356945e-07, "loss": 0.0028, "num_tokens": 38836781.0, "reward": 1.6875, "reward_std": 0.7039429545402527, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 218.375, "completions/mean_terminated_length": 218.375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.9083194982475558, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.10103574674576521, "learning_rate": 5.099396440269034e-07, "loss": 0.004, "num_tokens": 38845368.0, "reward": 1.5625, "reward_std": 0.6323143243789673, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.27998724579811096, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 418.125, "completions/mean_terminated_length": 418.125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.9085039660579229, "frac_reward_zero_std": 1.0, "grad_norm": 0.05859375, "kl": 0.04306099144741893, "learning_rate": 5.079112475902215e-07, "loss": 0.0017, "num_tokens": 38856945.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.90868843386829, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.09431128250434995, "learning_rate": 5.058867882669815e-07, "loss": 0.0038, "num_tokens": 38861115.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.9088729016786571, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.054934638319537044, "learning_rate": 5.038662668968885e-07, "loss": 0.0022, "num_tokens": 38871233.0, "reward": 1.3735294342041016, "reward_std": 0.21827073395252228, "rewards/fixed_code_pass_all_test_reward/mean": 0.37352943420410156, "rewards/fixed_code_pass_all_test_reward/std": 0.21827074885368347, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 388.25, "completions/mean_terminated_length": 388.25, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.9090573694890242, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.041538787772879004, "learning_rate": 5.018496843180099e-07, "loss": 0.0017, "num_tokens": 38881603.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.9092418372993912, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.034017655183561146, "learning_rate": 4.998370413667808e-07, "loss": 0.0014, "num_tokens": 38886856.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.9094263051097583, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.05949533777311444, "learning_rate": 4.978283388780003e-07, "loss": 0.0024, "num_tokens": 38891748.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.9096107729201255, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.04840176971629262, "learning_rate": 4.958235776848374e-07, "loss": 0.0019, "num_tokens": 38895623.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 352.5, "completions/mean_terminated_length": 352.5, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.9097952407304926, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.043702950002625585, "learning_rate": 4.938227586188227e-07, "loss": 0.0017, "num_tokens": 38902987.0, "reward": 1.9642857313156128, "reward_std": 0.10101523250341415, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 331.75, "completions/mean_terminated_length": 331.75, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.9099797085408596, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.05341605935245752, "learning_rate": 4.918258825098532e-07, "loss": 0.0021, "num_tokens": 38913601.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 402.125, "completions/mean_terminated_length": 402.125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.9101641763512267, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.052229733439162374, "learning_rate": 4.898329501861876e-07, "loss": 0.0021, "num_tokens": 38926266.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.9103486441615938, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.0434629765804857, "learning_rate": 4.878439624744558e-07, "loss": 0.0017, "num_tokens": 38936037.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 360.375, "completions/mean_terminated_length": 360.375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.9105331119719609, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.040494119515642524, "learning_rate": 4.858589201996433e-07, "loss": 0.0016, "num_tokens": 38946768.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 308.125, "completions/mean_terminated_length": 308.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.910717579782328, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.035676652332767844, "learning_rate": 4.838778241851028e-07, "loss": 0.0014, "num_tokens": 38955073.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 318.875, "completions/mean_terminated_length": 318.875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.9109020475926951, "frac_reward_zero_std": 1.0, "grad_norm": 0.044921875, "kl": 0.046035215724259615, "learning_rate": 4.819006752525523e-07, "loss": 0.0018, "num_tokens": 38964392.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.9110865154030622, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.05929613392800093, "learning_rate": 4.799274742220672e-07, "loss": 0.0024, "num_tokens": 38972140.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 197.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.9112709832134293, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.11628485610708594, "learning_rate": 4.779582219120926e-07, "loss": 0.0047, "num_tokens": 38979283.0, "reward": 1.109375, "reward_std": 0.5019493103027344, "rewards/fixed_code_pass_all_test_reward/mean": 0.234375, "rewards/fixed_code_pass_all_test_reward/std": 0.30935922265052795, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 393.25, "completions/mean_terminated_length": 393.25, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.9114554510237963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.039014171343296766, "learning_rate": 4.759929191394297e-07, "loss": 0.0016, "num_tokens": 38992053.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 232.25, "completions/mean_terminated_length": 232.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.9116399188341634, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.04594172479119152, "learning_rate": 4.740315667192441e-07, "loss": 0.0018, "num_tokens": 38998239.0, "reward": 1.6918103694915771, "reward_std": 0.45620474219322205, "rewards/fixed_code_pass_all_test_reward/mean": 0.9418103694915771, "rewards/fixed_code_pass_all_test_reward/std": 0.1645852029323578, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 355.75, "completions/mean_terminated_length": 355.75, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.9118243866445306, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.025015136576257646, "learning_rate": 4.7207416546506067e-07, "loss": 0.001, "num_tokens": 39004981.0, "reward": 1.8229167461395264, "reward_std": 0.23332267999649048, "rewards/fixed_code_pass_all_test_reward/mean": 0.8229166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.23332270979881287, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.9120088544548977, "frac_reward_zero_std": 1.0, "grad_norm": 0.41015625, "kl": 0.06182628567330539, "learning_rate": 4.7012071618877107e-07, "loss": 0.0025, "num_tokens": 39011023.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 235.875, "completions/mean_terminated_length": 235.875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.9121933222652647, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.07924421760253608, "learning_rate": 4.681712197006205e-07, "loss": 0.0032, "num_tokens": 39020270.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.9123777900756318, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.022190924326423556, "learning_rate": 4.662256768092199e-07, "loss": 0.0009, "num_tokens": 39028324.0, "reward": 1.1416666507720947, "reward_std": 0.02357027679681778, "rewards/fixed_code_pass_all_test_reward/mean": 0.14166668057441711, "rewards/fixed_code_pass_all_test_reward/std": 0.0235702246427536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 189.125, "completions/mean_terminated_length": 189.125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.9125622578859989, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.10299913911148906, "learning_rate": 4.6428408832153713e-07, "loss": 0.0041, "num_tokens": 39036613.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.912746725696366, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.05617269268259406, "learning_rate": 4.623464550429002e-07, "loss": 0.0022, "num_tokens": 39046744.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.9129311935067331, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.05519582633860409, "learning_rate": 4.604127777770007e-07, "loss": 0.0022, "num_tokens": 39050517.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.9131156613171002, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.044877044623717666, "learning_rate": 4.5848305732588363e-07, "loss": 0.0018, "num_tokens": 39057162.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 298.375, "completions/mean_terminated_length": 298.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.9133001291274673, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.048543372889980674, "learning_rate": 4.5655729448995656e-07, "loss": 0.0019, "num_tokens": 39063797.0, "reward": 1.6470588445663452, "reward_std": 0.49014005064964294, "rewards/fixed_code_pass_all_test_reward/mean": 0.6470588445663452, "rewards/fixed_code_pass_all_test_reward/std": 0.49014005064964294, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.9134845969378343, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.05756252072751522, "learning_rate": 4.546354900679839e-07, "loss": 0.0023, "num_tokens": 39074452.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 419.625, "completions/mean_terminated_length": 419.625, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.9136690647482014, "frac_reward_zero_std": 1.0, "grad_norm": 0.05712890625, "kl": 0.04609333025291562, "learning_rate": 4.5271764485708916e-07, "loss": 0.0018, "num_tokens": 39083193.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.9138535325585685, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.08480243710801005, "learning_rate": 4.5080375965275256e-07, "loss": 0.0034, "num_tokens": 39089344.0, "reward": 1.96875, "reward_std": 0.043129079043865204, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0431290864944458, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 185.5, "completions/mean_terminated_length": 185.5, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.9140380003689356, "frac_reward_zero_std": 0.0, "grad_norm": 3.546875, "kl": 0.08677652524784207, "learning_rate": 4.4889383524881257e-07, "loss": 0.0035, "num_tokens": 39095724.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 226.375, "completions/mean_terminated_length": 226.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.9142224681793027, "frac_reward_zero_std": 1.0, "grad_norm": 0.75, "kl": 0.09046106040477753, "learning_rate": 4.469878724374643e-07, "loss": 0.0036, "num_tokens": 39104063.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.9144069359896698, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.06443313229829073, "learning_rate": 4.450858720092599e-07, "loss": 0.0026, "num_tokens": 39111008.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 325.625, "completions/mean_terminated_length": 325.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.9145914038000369, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.044092179741710424, "learning_rate": 4.4318783475310935e-07, "loss": 0.0018, "num_tokens": 39116357.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.914775871610404, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.08988836221396923, "learning_rate": 4.412937614562762e-07, "loss": 0.0036, "num_tokens": 39123874.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.914960339420771, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.035583169432356954, "learning_rate": 4.3940365290438104e-07, "loss": 0.0014, "num_tokens": 39128830.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 416.125, "completions/mean_terminated_length": 416.125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9151448072311381, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.04079367942176759, "learning_rate": 4.3751750988139997e-07, "loss": 0.0016, "num_tokens": 39141255.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 414.0, "completions/mean_terminated_length": 414.0, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.9153292750415053, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.04345188639126718, "learning_rate": 4.356353331696661e-07, "loss": 0.0017, "num_tokens": 39149431.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 454.125, "completions/mean_terminated_length": 454.125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.9155137428518724, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.08695211401209235, "learning_rate": 4.3375712354986275e-07, "loss": 0.0035, "num_tokens": 39160056.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 554.875, "completions/mean_terminated_length": 554.875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.9156982106622394, "frac_reward_zero_std": 1.0, "grad_norm": 0.0296630859375, "kl": 0.020402985566761345, "learning_rate": 4.3188288180103455e-07, "loss": 0.0008, "num_tokens": 39170775.0, "reward": 1.2727272510528564, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 383.125, "completions/mean_terminated_length": 383.125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.9158826784726065, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.03979573480319232, "learning_rate": 4.30012608700574e-07, "loss": 0.0016, "num_tokens": 39179000.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 254.75, "completions/mean_terminated_length": 254.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.9160671462829736, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "kl": 0.04323420044966042, "learning_rate": 4.281463050242285e-07, "loss": 0.0017, "num_tokens": 39183726.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 415.125, "completions/mean_terminated_length": 415.125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.9162516140933407, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.0731120421551168, "learning_rate": 4.2628397154610536e-07, "loss": 0.0029, "num_tokens": 39191239.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9164360819037078, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.1472047781571746, "learning_rate": 4.2442560903865557e-07, "loss": 0.0059, "num_tokens": 39198597.0, "reward": 1.8801021575927734, "reward_std": 0.02543720230460167, "rewards/fixed_code_pass_all_test_reward/mean": 0.8801020383834839, "rewards/fixed_code_pass_all_test_reward/std": 0.02543720230460167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 399.0, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.9166205497140749, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.049189391545951366, "learning_rate": 4.2257121827269154e-07, "loss": 0.002, "num_tokens": 39206821.0, "reward": 1.644230842590332, "reward_std": 0.13598209619522095, "rewards/fixed_code_pass_all_test_reward/mean": 0.6442307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.13598208129405975, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.916805017524442, "frac_reward_zero_std": 1.0, "grad_norm": 2.578125, "kl": 0.16878299787640572, "learning_rate": 4.2072080001737123e-07, "loss": 0.0068, "num_tokens": 39217421.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 195.625, "completions/mean_terminated_length": 195.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.9169894853348091, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.08316252077929676, "learning_rate": 4.1887435504020856e-07, "loss": 0.0033, "num_tokens": 39225474.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 361.25, "completions/mean_terminated_length": 361.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.9171739531451761, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.05034649115987122, "learning_rate": 4.1703188410707087e-07, "loss": 0.002, "num_tokens": 39237468.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 130.875, "completions/mean_terminated_length": 130.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.9173584209555432, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.07300041615962982, "learning_rate": 4.151933879821757e-07, "loss": 0.0029, "num_tokens": 39241387.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 325.5, "completions/mean_terminated_length": 325.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.9175428887659104, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.06610232696402818, "learning_rate": 4.133588674280886e-07, "loss": 0.0026, "num_tokens": 39250375.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.9177273565762775, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.04084022017195821, "learning_rate": 4.11528323205731e-07, "loss": 0.0016, "num_tokens": 39256282.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 619.125, "completions/mean_terminated_length": 619.125, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.9179118243866445, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.020448027702514082, "learning_rate": 4.0970175607437324e-07, "loss": 0.0008, "num_tokens": 39272715.0, "reward": 1.453125, "reward_std": 0.22097086906433105, "rewards/fixed_code_pass_all_test_reward/mean": 0.453125, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.9180962921970116, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.058401789516210556, "learning_rate": 4.0787916679163595e-07, "loss": 0.0023, "num_tokens": 39278363.0, "reward": 1.296875, "reward_std": 0.5258390307426453, "rewards/fixed_code_pass_all_test_reward/mean": 0.421875, "rewards/fixed_code_pass_all_test_reward/std": 0.17598575353622437, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 427.875, "completions/mean_terminated_length": 427.875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.9182807600073787, "frac_reward_zero_std": 1.0, "grad_norm": 0.035888671875, "kl": 0.02698123815935105, "learning_rate": 4.0606055611348894e-07, "loss": 0.0011, "num_tokens": 39286170.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 344.75, "completions/mean_terminated_length": 344.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.9184652278177458, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.03271307412069291, "learning_rate": 4.042459247942532e-07, "loss": 0.0013, "num_tokens": 39292256.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 395.375, "completions/mean_terminated_length": 395.375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.9186496956281129, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.053474400425329804, "learning_rate": 4.0243527358659774e-07, "loss": 0.0021, "num_tokens": 39301699.0, "reward": 1.78125, "reward_std": 0.3186887204647064, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.3186887204647064, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.91883416343848, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.07105868915095925, "learning_rate": 4.0062860324154407e-07, "loss": 0.0028, "num_tokens": 39310462.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 298.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.9190186312488471, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0591706158593297, "learning_rate": 3.988259145084572e-07, "loss": 0.0024, "num_tokens": 39323484.0, "reward": 1.4722222089767456, "reward_std": 0.11501094698905945, "rewards/fixed_code_pass_all_test_reward/mean": 0.472222238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.11501093953847885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.9192030990592142, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.06384352198801935, "learning_rate": 3.970272081350557e-07, "loss": 0.0026, "num_tokens": 39330947.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.9193875668695812, "frac_reward_zero_std": 1.0, "grad_norm": 0.30078125, "kl": 0.08421983709558845, "learning_rate": 3.9523248486740053e-07, "loss": 0.0034, "num_tokens": 39339512.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 307.75, "completions/mean_terminated_length": 307.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.9195720346799483, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.09024346293881536, "learning_rate": 3.934417454499084e-07, "loss": 0.0036, "num_tokens": 39348838.0, "reward": 1.3486841917037964, "reward_std": 0.4630703032016754, "rewards/fixed_code_pass_all_test_reward/mean": 0.3486842215061188, "rewards/fixed_code_pass_all_test_reward/std": 0.4630703032016754, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 204.375, "completions/mean_terminated_length": 204.375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.9197565024903155, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.07255231053568423, "learning_rate": 3.9165499062533863e-07, "loss": 0.0029, "num_tokens": 39359153.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 465.625, "completions/mean_terminated_length": 465.625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.9199409703006826, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.024539513542549685, "learning_rate": 3.898722211347972e-07, "loss": 0.001, "num_tokens": 39373086.0, "reward": 1.2916667461395264, "reward_std": 0.2965855300426483, "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.2965855300426483, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 220.125, "completions/mean_terminated_length": 220.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.9201254381110496, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.06000029668211937, "learning_rate": 3.880934377177381e-07, "loss": 0.0024, "num_tokens": 39380951.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.9203099059214167, "frac_reward_zero_std": 1.0, "grad_norm": 0.60546875, "kl": 0.08751343458425254, "learning_rate": 3.8631864111196127e-07, "loss": 0.0035, "num_tokens": 39385121.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 294.375, "completions/mean_terminated_length": 294.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.9204943737317838, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.06357843382284045, "learning_rate": 3.845478320536178e-07, "loss": 0.0025, "num_tokens": 39393228.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 310.5, "completions/mean_terminated_length": 310.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.9206788415421508, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.03705056104809046, "learning_rate": 3.82781011277199e-07, "loss": 0.0015, "num_tokens": 39400104.0, "reward": 1.8287036418914795, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.8287037014961243, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.920863309352518, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.04473071277607232, "learning_rate": 3.8101817951554323e-07, "loss": 0.0018, "num_tokens": 39406037.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 306.875, "completions/mean_terminated_length": 306.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.9210477771628851, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.08841501316055655, "learning_rate": 3.7925933749983547e-07, "loss": 0.0035, "num_tokens": 39415004.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1277.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 1004.625, "completions/mean_terminated_length": 1004.625, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.9212322449732522, "frac_reward_zero_std": 0.0, "grad_norm": 0.482421875, "kl": 0.022050478612072766, "learning_rate": 3.7750448595960665e-07, "loss": 0.0009, "num_tokens": 39440569.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.9214167127836193, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.08474683901295066, "learning_rate": 3.7575362562272897e-07, "loss": 0.0034, "num_tokens": 39448176.0, "reward": 1.7727272510528564, "reward_std": 0.18819968402385712, "rewards/fixed_code_pass_all_test_reward/mean": 0.7727272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.1881996989250183, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 279.0, "completions/mean_terminated_length": 279.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.9216011805939863, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.09258189518004656, "learning_rate": 3.740067572154238e-07, "loss": 0.0037, "num_tokens": 39457360.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 597.375, "completions/mean_terminated_length": 597.375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.9217856484043534, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.035066614334937185, "learning_rate": 3.722638814622526e-07, "loss": 0.0014, "num_tokens": 39473635.0, "reward": 1.375, "reward_std": 0.4432026147842407, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 311.625, "completions/mean_terminated_length": 311.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9219701162147206, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.04297756194137037, "learning_rate": 3.705249990861248e-07, "loss": 0.0017, "num_tokens": 39480864.0, "reward": 1.6918604373931885, "reward_std": 0.4561850130558014, "rewards/fixed_code_pass_all_test_reward/mean": 0.9418604373931885, "rewards/fixed_code_pass_all_test_reward/std": 0.02153068408370018, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 370.875, "completions/mean_terminated_length": 370.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.9221545840250877, "frac_reward_zero_std": 1.0, "grad_norm": 0.046630859375, "kl": 0.0561925214715302, "learning_rate": 3.6879011080828917e-07, "loss": 0.0022, "num_tokens": 39494703.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.9223390518354547, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.05592159740626812, "learning_rate": 3.6705921734834005e-07, "loss": 0.0022, "num_tokens": 39501931.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.9225235196458218, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.24295255006290972, "learning_rate": 3.6533231942421644e-07, "loss": 0.0097, "num_tokens": 39505979.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 421.125, "completions/mean_terminated_length": 421.125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.9227079874561889, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.04870702954940498, "learning_rate": 3.636094177521954e-07, "loss": 0.0019, "num_tokens": 39513788.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 365.75, "completions/mean_terminated_length": 365.75, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.9228924552665559, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.03635401767678559, "learning_rate": 3.6189051304690194e-07, "loss": 0.0015, "num_tokens": 39520402.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 393.0, "completions/mean_terminated_length": 393.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9230769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.05260018166154623, "learning_rate": 3.601756060213002e-07, "loss": 0.0021, "num_tokens": 39528178.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 166.125, "completions/mean_terminated_length": 166.125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.9232613908872902, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.10474250884726644, "learning_rate": 3.584646973866957e-07, "loss": 0.0042, "num_tokens": 39535963.0, "reward": 1.0833333730697632, "reward_std": 0.1543033868074417, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.15430335700511932, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 298.875, "completions/mean_terminated_length": 298.875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9234458586976573, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.046618149150162935, "learning_rate": 3.567577878527373e-07, "loss": 0.0019, "num_tokens": 39545138.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 550.5, "completions/mean_terminated_length": 550.5, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.9236303265080243, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.048687602393329144, "learning_rate": 3.5505487812741213e-07, "loss": 0.0019, "num_tokens": 39558950.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 411.0, "completions/mean_terminated_length": 411.0, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.9238147943183914, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.03101811883971095, "learning_rate": 3.5335596891705406e-07, "loss": 0.0012, "num_tokens": 39567846.0, "reward": 1.8125, "reward_std": 0.1552647203207016, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.15526476502418518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.9239992621287585, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0489976117387414, "learning_rate": 3.5166106092633047e-07, "loss": 0.002, "num_tokens": 39574120.0, "reward": 1.66304349899292, "reward_std": 0.3873157203197479, "rewards/fixed_code_pass_all_test_reward/mean": 0.6630434989929199, "rewards/fixed_code_pass_all_test_reward/std": 0.3873157799243927, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 331.0, "completions/mean_terminated_length": 331.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.9241837299391257, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05062464717775583, "learning_rate": 3.499701548582557e-07, "loss": 0.002, "num_tokens": 39584264.0, "reward": 1.6381579637527466, "reward_std": 0.2996337115764618, "rewards/fixed_code_pass_all_test_reward/mean": 0.6381579041481018, "rewards/fixed_code_pass_all_test_reward/std": 0.2996337413787842, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 232.5, "completions/mean_terminated_length": 232.5, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.9243681977494927, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.06944863661192358, "learning_rate": 3.4828325141417765e-07, "loss": 0.0028, "num_tokens": 39590372.0, "reward": 1.9086538553237915, "reward_std": 0.2583659291267395, "rewards/fixed_code_pass_all_test_reward/mean": 0.9086538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.2583659589290619, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 127.75, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.9245526655598598, "frac_reward_zero_std": 0.0, "grad_norm": 3.875, "kl": 0.12054821103811264, "learning_rate": 3.4660035129378877e-07, "loss": 0.0048, "num_tokens": 39597202.0, "reward": 1.5113636255264282, "reward_std": 0.5232211351394653, "rewards/fixed_code_pass_all_test_reward/mean": 0.5113636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.5232211351394653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.9247371333702269, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.07804769556969404, "learning_rate": 3.449214551951219e-07, "loss": 0.0031, "num_tokens": 39603588.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.924921601180594, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.0820810676086694, "learning_rate": 3.4324656381454434e-07, "loss": 0.0033, "num_tokens": 39610890.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.925106068990961, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.05730056297034025, "learning_rate": 3.415756778467649e-07, "loss": 0.0023, "num_tokens": 39617560.0, "reward": 0.9249999523162842, "reward_std": 0.3845219612121582, "rewards/fixed_code_pass_all_test_reward/mean": 0.05000000074505806, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.9252905368013282, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.052450764225795865, "learning_rate": 3.3990879798483145e-07, "loss": 0.0021, "num_tokens": 39625781.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.9254750046116953, "frac_reward_zero_std": 0.0, "grad_norm": 3.1875, "kl": 0.040907337446697056, "learning_rate": 3.3824592492013087e-07, "loss": 0.0016, "num_tokens": 39629872.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 224.5, "completions/mean_terminated_length": 224.5, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.9256594724220624, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.0770728844217956, "learning_rate": 3.36587059342387e-07, "loss": 0.0031, "num_tokens": 39634924.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 345.0, "completions/mean_terminated_length": 345.0, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.9258439402324294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.053679333999753, "learning_rate": 3.349322019396595e-07, "loss": 0.0021, "num_tokens": 39641452.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.9260284080427965, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.0813617009553127, "learning_rate": 3.3328135339834923e-07, "loss": 0.0033, "num_tokens": 39646723.0, "reward": 1.566666603088379, "reward_std": 0.38668307662010193, "rewards/fixed_code_pass_all_test_reward/mean": 0.5666666626930237, "rewards/fixed_code_pass_all_test_reward/std": 0.3866831064224243, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 370.5, "completions/mean_terminated_length": 370.5, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.9262128758531636, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.01535928319208324, "learning_rate": 3.316345144031896e-07, "loss": 0.0006, "num_tokens": 39655247.0, "reward": 1.9140625, "reward_std": 0.17014141380786896, "rewards/fixed_code_pass_all_test_reward/mean": 0.9140625, "rewards/fixed_code_pass_all_test_reward/std": 0.17014142870903015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 260.375, "completions/mean_terminated_length": 260.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.9263973436635307, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.03747258766088635, "learning_rate": 3.299916856372587e-07, "loss": 0.0015, "num_tokens": 39663730.0, "reward": 1.970588207244873, "reward_std": 0.08318904042243958, "rewards/fixed_code_pass_all_test_reward/mean": 0.970588207244873, "rewards/fixed_code_pass_all_test_reward/std": 0.08318902552127838, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 279.0, "completions/mean_terminated_length": 279.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.9265818114738978, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.04144653188996017, "learning_rate": 3.2835286778196363e-07, "loss": 0.0017, "num_tokens": 39669130.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 374.25, "completions/mean_terminated_length": 374.25, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.9267662792842649, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.08063054434023798, "learning_rate": 3.2671806151704957e-07, "loss": 0.0032, "num_tokens": 39677188.0, "reward": 1.65625, "reward_std": 0.3290272355079651, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.3290272653102875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 260.75, "completions/mean_terminated_length": 260.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.926950747094632, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.037197613972239196, "learning_rate": 3.2508726752060074e-07, "loss": 0.0015, "num_tokens": 39685962.0, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 259.25, "completions/mean_terminated_length": 259.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.9271352149049991, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.07752236537635326, "learning_rate": 3.2346048646903494e-07, "loss": 0.0031, "num_tokens": 39694724.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 204.75, "completions/mean_terminated_length": 204.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.9273196827153661, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.05625193798914552, "learning_rate": 3.218377190371047e-07, "loss": 0.0023, "num_tokens": 39699186.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 311.625, "completions/mean_terminated_length": 311.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.9275041505257332, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.05155350302811712, "learning_rate": 3.2021896589790156e-07, "loss": 0.0021, "num_tokens": 39708743.0, "reward": 1.40625, "reward_std": 0.31737273931503296, "rewards/fixed_code_pass_all_test_reward/mean": 0.40625, "rewards/fixed_code_pass_all_test_reward/std": 0.31737273931503296, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 379.875, "completions/mean_terminated_length": 379.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9276886183361004, "frac_reward_zero_std": 1.0, "grad_norm": 0.04833984375, "kl": 0.04318996728397906, "learning_rate": 3.1860422772284515e-07, "loss": 0.0017, "num_tokens": 39718862.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 296.875, "completions/mean_terminated_length": 296.875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.9278730861464675, "frac_reward_zero_std": 1.0, "grad_norm": 0.1875, "kl": 0.07011310756206512, "learning_rate": 3.1699350518169745e-07, "loss": 0.0028, "num_tokens": 39728613.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.9280575539568345, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.10904186917468905, "learning_rate": 3.1538679894255185e-07, "loss": 0.0044, "num_tokens": 39735957.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 349.0, "completions/mean_terminated_length": 349.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.9282420217672016, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.04348604497499764, "learning_rate": 3.1378410967183305e-07, "loss": 0.0017, "num_tokens": 39745301.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.9284264895775687, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.04507743986323476, "learning_rate": 3.1218543803430257e-07, "loss": 0.0018, "num_tokens": 39752671.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.9286109573879358, "frac_reward_zero_std": 1.0, "grad_norm": 0.3046875, "kl": 0.06658057402819395, "learning_rate": 3.105907846930534e-07, "loss": 0.0027, "num_tokens": 39757123.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 110.125, "completions/mean_terminated_length": 110.125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.9287954251983029, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.03997925599105656, "learning_rate": 3.090001503095175e-07, "loss": 0.0016, "num_tokens": 39760708.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 428.875, "completions/mean_terminated_length": 428.875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.92897989300867, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.030504170688800514, "learning_rate": 3.0741353554345267e-07, "loss": 0.0012, "num_tokens": 39769771.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.9291643608190371, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.05626562424004078, "learning_rate": 3.0583094105295253e-07, "loss": 0.0023, "num_tokens": 39779363.0, "reward": 1.3035714626312256, "reward_std": 0.42984506487846375, "rewards/fixed_code_pass_all_test_reward/mean": 0.3035714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.42984503507614136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 323.5, "completions/mean_terminated_length": 323.5, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.9293488286294042, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.029891030164435506, "learning_rate": 3.042523674944431e-07, "loss": 0.0012, "num_tokens": 39789023.0, "reward": 1.39453125, "reward_std": 0.38452833890914917, "rewards/fixed_code_pass_all_test_reward/mean": 0.51953125, "rewards/fixed_code_pass_all_test_reward/std": 0.20992232859134674, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 250.625, "completions/mean_terminated_length": 250.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.9295332964397712, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.06433066190220416, "learning_rate": 3.0267781552268394e-07, "loss": 0.0026, "num_tokens": 39795172.0, "reward": 1.683333396911621, "reward_std": 0.3629683554172516, "rewards/fixed_code_pass_all_test_reward/mean": 0.6833333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.362968385219574, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 331.0, "completions/mean_terminated_length": 331.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.9297177642501383, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.06489445548504591, "learning_rate": 3.011072857907649e-07, "loss": 0.0026, "num_tokens": 39801692.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 255.375, "completions/mean_terminated_length": 255.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.9299022320605055, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.02751614013686776, "learning_rate": 2.99540778950107e-07, "loss": 0.0011, "num_tokens": 39806711.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 240.125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.9300866998708726, "frac_reward_zero_std": 1.0, "grad_norm": 0.04541015625, "kl": 0.029847529833205044, "learning_rate": 2.979782956504629e-07, "loss": 0.0012, "num_tokens": 39814624.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 255.5, "completions/mean_terminated_length": 255.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.9302711676812396, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.038087685126811266, "learning_rate": 2.9641983653991737e-07, "loss": 0.0015, "num_tokens": 39824052.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.9304556354916067, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.07416649954393506, "learning_rate": 2.9486540226488556e-07, "loss": 0.003, "num_tokens": 39832975.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.9306401033019738, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.06999574322253466, "learning_rate": 2.933149934701152e-07, "loss": 0.0028, "num_tokens": 39839525.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.9308245711123408, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.06497129378840327, "learning_rate": 2.9176861079868056e-07, "loss": 0.0026, "num_tokens": 39848068.0, "reward": 1.8928570747375488, "reward_std": 0.14028292894363403, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.14028292894363403, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.931009038922708, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.048377772560343146, "learning_rate": 2.902262548919876e-07, "loss": 0.0019, "num_tokens": 39852444.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 277.5, "completions/mean_terminated_length": 277.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.9311935067330751, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.06345727946609259, "learning_rate": 2.886879263897724e-07, "loss": 0.0025, "num_tokens": 39863456.0, "reward": 1.5333333015441895, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5333333611488342, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.9313779745434422, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.055331150302663445, "learning_rate": 2.871536259301011e-07, "loss": 0.0022, "num_tokens": 39868525.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 5049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 434.125, "completions/mean_terminated_length": 434.125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.9315624423538093, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.04852217226289213, "learning_rate": 2.8562335414936915e-07, "loss": 0.0019, "num_tokens": 39880694.0, "reward": 1.3839285373687744, "reward_std": 0.7392943501472473, "rewards/fixed_code_pass_all_test_reward/mean": 0.5089285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.5254977941513062, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 310.625, "completions/mean_terminated_length": 310.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.9317469101641763, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.03913004812784493, "learning_rate": 2.840971116822977e-07, "loss": 0.0016, "num_tokens": 39886299.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 209.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.9319313779745434, "frac_reward_zero_std": 1.0, "grad_norm": 0.10546875, "kl": 0.04741193703375757, "learning_rate": 2.825748991619415e-07, "loss": 0.0019, "num_tokens": 39890989.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 248.5, "completions/mean_terminated_length": 248.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.9321158457849106, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.05126552144065499, "learning_rate": 2.8105671721967874e-07, "loss": 0.0021, "num_tokens": 39898753.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.9323003135952777, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.048742849088739604, "learning_rate": 2.7954256648522136e-07, "loss": 0.0019, "num_tokens": 39904150.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 311.0, "completions/mean_terminated_length": 311.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.9324847814056447, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.04309336398728192, "learning_rate": 2.7803244758660477e-07, "loss": 0.0017, "num_tokens": 39910878.0, "reward": 1.7034574747085571, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.7034574747085571, "rewards/fixed_code_pass_all_test_reward/std": 0.176776722073555, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 367.375, "completions/mean_terminated_length": 367.375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.9326692492160118, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0628190222196281, "learning_rate": 2.765263611501956e-07, "loss": 0.0025, "num_tokens": 39926241.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.9328537170263789, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.06563755450770259, "learning_rate": 2.75024307800682e-07, "loss": 0.0026, "num_tokens": 39935134.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 339.875, "completions/mean_terminated_length": 339.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.9330381848367459, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.030307218490634114, "learning_rate": 2.7352628816108785e-07, "loss": 0.0012, "num_tokens": 39946165.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 393.0, "completions/mean_terminated_length": 393.0, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.9332226526471131, "frac_reward_zero_std": 1.0, "grad_norm": 0.03759765625, "kl": 0.0298910578712821, "learning_rate": 2.7203230285275716e-07, "loss": 0.0012, "num_tokens": 39957069.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 95.375, "completions/mean_terminated_length": 95.375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.9334071204574802, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.05620970972813666, "learning_rate": 2.7054235249536207e-07, "loss": 0.0022, "num_tokens": 39960608.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 338.625, "completions/mean_terminated_length": 338.625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.9335915882678473, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.09419998759403825, "learning_rate": 2.690564377069027e-07, "loss": 0.0038, "num_tokens": 39967709.0, "reward": 1.5446429252624512, "reward_std": 0.3123724162578583, "rewards/fixed_code_pass_all_test_reward/mean": 0.6696428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.2587745785713196, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 308.25, "completions/mean_terminated_length": 308.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.9337760560782143, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.025782336248084903, "learning_rate": 2.6757455910370486e-07, "loss": 0.001, "num_tokens": 39974687.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 342.625, "completions/mean_terminated_length": 342.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.9339605238885814, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.05772076663561165, "learning_rate": 2.6609671730041917e-07, "loss": 0.0023, "num_tokens": 39980852.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 128.875, "completions/mean_terminated_length": 128.875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.9341449916989485, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.04221416520886123, "learning_rate": 2.6462291291002305e-07, "loss": 0.0017, "num_tokens": 39984603.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.9343294595093157, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.08442370686680079, "learning_rate": 2.631531465438186e-07, "loss": 0.0034, "num_tokens": 39992076.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 254.75, "completions/mean_terminated_length": 254.75, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.9345139273196827, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.07286844216287136, "learning_rate": 2.616874188114316e-07, "loss": 0.0029, "num_tokens": 39998426.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.9346983951300498, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.05659839848522097, "learning_rate": 2.6022573032081666e-07, "loss": 0.0023, "num_tokens": 40007082.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 268.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.9348828629404169, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.05439062509685755, "learning_rate": 2.5876808167825005e-07, "loss": 0.0022, "num_tokens": 40015858.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 194.875, "completions/mean_terminated_length": 194.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.935067330750784, "frac_reward_zero_std": 1.0, "grad_norm": 0.052490234375, "kl": 0.022413723170757294, "learning_rate": 2.573144734883315e-07, "loss": 0.0009, "num_tokens": 40020241.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.935251798561151, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.06574002653360367, "learning_rate": 2.558649063539864e-07, "loss": 0.0026, "num_tokens": 40031050.0, "reward": 1.879166603088379, "reward_std": 0.3417682945728302, "rewards/fixed_code_pass_all_test_reward/mean": 0.8791666626930237, "rewards/fixed_code_pass_all_test_reward/std": 0.3417682647705078, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.9354362663715182, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "kl": 0.09100389527156949, "learning_rate": 2.544193808764661e-07, "loss": 0.0036, "num_tokens": 40037776.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 182.125, "completions/mean_terminated_length": 182.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.9356207341818853, "frac_reward_zero_std": 1.0, "grad_norm": 0.271484375, "kl": 0.04938281839713454, "learning_rate": 2.5297789765534206e-07, "loss": 0.002, "num_tokens": 40046625.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 336.25, "completions/mean_terminated_length": 336.25, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.9358052019922524, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.07438225415535271, "learning_rate": 2.515404572885094e-07, "loss": 0.003, "num_tokens": 40056315.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 320.125, "completions/mean_terminated_length": 320.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.9359896698026194, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.061915343394503, "learning_rate": 2.501070603721889e-07, "loss": 0.0025, "num_tokens": 40063020.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 451.75, "completions/mean_terminated_length": 451.75, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.9361741376129865, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.08270094078034163, "learning_rate": 2.486777075009206e-07, "loss": 0.0033, "num_tokens": 40071074.0, "reward": 1.375, "reward_std": 0.5382592678070068, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.15625910460948944, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 5075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 129.75, "completions/mean_terminated_length": 129.75, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9363586054233536, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.05625135265290737, "learning_rate": 2.472523992675724e-07, "loss": 0.0023, "num_tokens": 40078000.0, "reward": 1.96875, "reward_std": 0.05818168818950653, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.05818169191479683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 422.375, "completions/mean_terminated_length": 422.375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.9365430732337208, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.05569751490838826, "learning_rate": 2.458311362633292e-07, "loss": 0.0022, "num_tokens": 40090459.0, "reward": 1.8538135290145874, "reward_std": 0.3472287058830261, "rewards/fixed_code_pass_all_test_reward/mean": 0.8538135290145874, "rewards/fixed_code_pass_all_test_reward/std": 0.3472287058830261, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.9367275410440878, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.10797767480835319, "learning_rate": 2.4441391907770064e-07, "loss": 0.0043, "num_tokens": 40099157.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9369120088544549, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.03473189508076757, "learning_rate": 2.430007482985164e-07, "loss": 0.0014, "num_tokens": 40107217.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.937096476664822, "frac_reward_zero_std": 1.0, "grad_norm": 0.2099609375, "kl": 0.08814511820673943, "learning_rate": 2.41591624511931e-07, "loss": 0.0035, "num_tokens": 40115497.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 363.875, "completions/mean_terminated_length": 363.875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.9372809444751891, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.03668362111784518, "learning_rate": 2.4018654830241904e-07, "loss": 0.0015, "num_tokens": 40123000.0, "reward": 1.298076868057251, "reward_std": 0.29489168524742126, "rewards/fixed_code_pass_all_test_reward/mean": 0.29807692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.29489171504974365, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.9374654122855561, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.04481459828093648, "learning_rate": 2.3878552025277447e-07, "loss": 0.0018, "num_tokens": 40128851.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 266.25, "completions/mean_terminated_length": 266.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.9376498800959233, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.0542614315636456, "learning_rate": 2.3738854094411347e-07, "loss": 0.0022, "num_tokens": 40137445.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 302.5, "completions/mean_terminated_length": 302.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.9378343479062904, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "kl": 0.07052958523854613, "learning_rate": 2.3599561095587364e-07, "loss": 0.0028, "num_tokens": 40143033.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 5084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 217.0, "completions/mean_terminated_length": 217.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.9380188157166575, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.04219975811429322, "learning_rate": 2.3460673086581064e-07, "loss": 0.0017, "num_tokens": 40150289.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 265.5, "completions/mean_terminated_length": 265.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.9382032835270245, "frac_reward_zero_std": 1.0, "grad_norm": 0.046875, "kl": 0.06844292255118489, "learning_rate": 2.3322190125000477e-07, "loss": 0.0027, "num_tokens": 40156293.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 241.625, "completions/mean_terminated_length": 241.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.9383877513373916, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.0438910573720932, "learning_rate": 2.3184112268285208e-07, "loss": 0.0018, "num_tokens": 40161562.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9385722191477587, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.058349909260869026, "learning_rate": 2.304643957370689e-07, "loss": 0.0023, "num_tokens": 40167957.0, "reward": 1.975000023841858, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 306.0, "completions/mean_terminated_length": 306.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.9387566869581258, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.042486088001169264, "learning_rate": 2.290917209836918e-07, "loss": 0.0017, "num_tokens": 40176341.0, "reward": 1.9330357313156128, "reward_std": 0.18940357863903046, "rewards/fixed_code_pass_all_test_reward/mean": 0.9330357313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.18940360844135284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 393.25, "completions/mean_terminated_length": 393.25, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.9389411547684929, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.024899858806747943, "learning_rate": 2.2772309899207868e-07, "loss": 0.001, "num_tokens": 40183487.0, "reward": 1.9249999523162842, "reward_std": 0.2121320217847824, "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 299.25, "completions/mean_terminated_length": 299.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.93912562257886, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.06078976625576615, "learning_rate": 2.2635853032990206e-07, "loss": 0.0024, "num_tokens": 40192601.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 317.25, "completions/mean_terminated_length": 317.25, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.9393100903892271, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.06599288806319237, "learning_rate": 2.2499801556315704e-07, "loss": 0.0026, "num_tokens": 40199027.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.9394945581995942, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.05913840956054628, "learning_rate": 2.236415552561544e-07, "loss": 0.0024, "num_tokens": 40206553.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 376.5, "completions/mean_terminated_length": 376.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.9396790260099612, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.05529837519861758, "learning_rate": 2.222891499715252e-07, "loss": 0.0022, "num_tokens": 40214189.0, "reward": 1.932692289352417, "reward_std": 0.09364870935678482, "rewards/fixed_code_pass_all_test_reward/mean": 0.932692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.09364869445562363, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1655.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 400.75, "completions/mean_terminated_length": 400.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.9398634938203283, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.07176515809260309, "learning_rate": 2.2094080027021959e-07, "loss": 0.0029, "num_tokens": 40220275.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.9400479616306955, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.058903482742607594, "learning_rate": 2.1959650671150134e-07, "loss": 0.0024, "num_tokens": 40228632.0, "reward": 1.9500000476837158, "reward_std": 0.09636241942644119, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.0963624119758606, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.9402324294410626, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.07160257967188954, "learning_rate": 2.1825626985295666e-07, "loss": 0.0029, "num_tokens": 40237090.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 489.0, "completions/mean_terminated_length": 489.0, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.9404168972514296, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.035299477400258183, "learning_rate": 2.169200902504842e-07, "loss": 0.0014, "num_tokens": 40246018.0, "reward": 1.1124999523162842, "reward_std": 0.035355351865291595, "rewards/fixed_code_pass_all_test_reward/mean": 0.11249999701976776, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 394.25, "completions/mean_terminated_length": 394.25, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.9406013650617967, "frac_reward_zero_std": 1.0, "grad_norm": 0.046630859375, "kl": 0.0348573827650398, "learning_rate": 2.1558796845830399e-07, "loss": 0.0014, "num_tokens": 40253436.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 363.375, "completions/mean_terminated_length": 363.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.9407858328721638, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.04301724175456911, "learning_rate": 2.1425990502895179e-07, "loss": 0.0017, "num_tokens": 40263999.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.9409703006825308, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.06973888585343957, "learning_rate": 2.129359005132803e-07, "loss": 0.0028, "num_tokens": 40268403.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 219.625, "completions/mean_terminated_length": 219.625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.941154768492898, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.051676519215106964, "learning_rate": 2.1161595546045466e-07, "loss": 0.0021, "num_tokens": 40273976.0, "reward": 1.817307710647583, "reward_std": 0.027196446433663368, "rewards/fixed_code_pass_all_test_reward/mean": 0.817307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.9413392363032651, "frac_reward_zero_std": 1.0, "grad_norm": 0.53515625, "kl": 0.10600506560876966, "learning_rate": 2.1030007041796252e-07, "loss": 0.0042, "num_tokens": 40282519.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 542.0, "completions/mean_terminated_length": 542.0, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.9415237041136322, "frac_reward_zero_std": 1.0, "grad_norm": 0.03515625, "kl": 0.01804348104633391, "learning_rate": 2.0898824593160504e-07, "loss": 0.0007, "num_tokens": 40293143.0, "reward": 1.2727272510528564, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.9417081719239992, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.0584296933375299, "learning_rate": 2.07680482545497e-07, "loss": 0.0023, "num_tokens": 40301564.0, "reward": 1.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 264.625, "completions/mean_terminated_length": 264.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.9418926397343663, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.0460121629294008, "learning_rate": 2.0637678080207002e-07, "loss": 0.0018, "num_tokens": 40306617.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 247.375, "completions/mean_terminated_length": 247.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.9420771075447334, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.07992794550955296, "learning_rate": 2.0507714124207157e-07, "loss": 0.0032, "num_tokens": 40314364.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 292.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9422615753551006, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.047154433792456985, "learning_rate": 2.037815644045671e-07, "loss": 0.0019, "num_tokens": 40325099.0, "reward": 1.730263113975525, "reward_std": 0.1089901402592659, "rewards/fixed_code_pass_all_test_reward/mean": 0.7302631139755249, "rewards/fixed_code_pass_all_test_reward/std": 0.1089901551604271, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 343.375, "completions/mean_terminated_length": 343.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.9424460431654677, "frac_reward_zero_std": 1.0, "grad_norm": 0.047607421875, "kl": 0.022906065685674548, "learning_rate": 2.0249005082693008e-07, "loss": 0.0009, "num_tokens": 40331942.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.9426305109758347, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.05648009665310383, "learning_rate": 2.0120260104485422e-07, "loss": 0.0023, "num_tokens": 40336345.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 221.25, "completions/mean_terminated_length": 221.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.9428149787862018, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.04488317831419408, "learning_rate": 1.9991921559234572e-07, "loss": 0.0018, "num_tokens": 40343859.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 285.875, "completions/mean_terminated_length": 285.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9429994465965689, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.04050062980968505, "learning_rate": 1.9863989500172543e-07, "loss": 0.0016, "num_tokens": 40350074.0, "reward": 1.6644736528396606, "reward_std": 0.3586927354335785, "rewards/fixed_code_pass_all_test_reward/mean": 0.6644736528396606, "rewards/fixed_code_pass_all_test_reward/std": 0.3586927354335785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.9431839144069359, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962890625, "kl": 0.05874213110655546, "learning_rate": 1.9736463980362665e-07, "loss": 0.0023, "num_tokens": 40354659.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 335.75, "completions/mean_terminated_length": 335.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.9433683822173031, "frac_reward_zero_std": 1.0, "grad_norm": 0.796875, "kl": 0.12244824832305312, "learning_rate": 1.9609345052699957e-07, "loss": 0.0049, "num_tokens": 40364185.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 255.5, "completions/mean_terminated_length": 255.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.9435528500276702, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.044695014134049416, "learning_rate": 1.948263276991047e-07, "loss": 0.0018, "num_tokens": 40372197.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.9437373178380373, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.04672929993830621, "learning_rate": 1.9356327184551716e-07, "loss": 0.0019, "num_tokens": 40381527.0, "reward": 1.454545497894287, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4545454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 425.0, "completions/mean_terminated_length": 425.0, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.9439217856484043, "frac_reward_zero_std": 1.0, "grad_norm": 0.3203125, "kl": 0.03375612257514149, "learning_rate": 1.9230428349012674e-07, "loss": 0.0014, "num_tokens": 40388719.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9441062534587714, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.07673407392576337, "learning_rate": 1.9104936315513244e-07, "loss": 0.0031, "num_tokens": 40396021.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9442907212691385, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.07327688625082374, "learning_rate": 1.897985113610501e-07, "loss": 0.0029, "num_tokens": 40404532.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.9444751890795057, "frac_reward_zero_std": 1.0, "grad_norm": 0.58984375, "kl": 0.10824392596259713, "learning_rate": 1.8855172862670467e-07, "loss": 0.0043, "num_tokens": 40409237.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 182.125, "completions/mean_terminated_length": 182.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.9446596568898727, "frac_reward_zero_std": 1.0, "grad_norm": 0.330078125, "kl": 0.07042802963405848, "learning_rate": 1.8730901546923585e-07, "loss": 0.0028, "num_tokens": 40415974.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.9448441247002398, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.056787248235195875, "learning_rate": 1.8607037240409353e-07, "loss": 0.0023, "num_tokens": 40420167.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 624.625, "completions/mean_terminated_length": 624.625, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.9450285925106069, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.03252625372260809, "learning_rate": 1.8483579994503896e-07, "loss": 0.0013, "num_tokens": 40435436.0, "reward": 1.980769157409668, "reward_std": 0.03560848906636238, "rewards/fixed_code_pass_all_test_reward/mean": 0.9807692170143127, "rewards/fixed_code_pass_all_test_reward/std": 0.03560846298933029, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 550.75, "completions/mean_terminated_length": 550.75, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.945213060320974, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.060397819615900517, "learning_rate": 1.836052986041481e-07, "loss": 0.0024, "num_tokens": 40450754.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 298.875, "completions/mean_terminated_length": 298.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.945397528131341, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.057516380213201046, "learning_rate": 1.823788688918049e-07, "loss": 0.0023, "num_tokens": 40459793.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.9455819959417082, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.051785388961434364, "learning_rate": 1.8115651131670796e-07, "loss": 0.0021, "num_tokens": 40466141.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 363.0, "completions/mean_terminated_length": 363.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.9457664637520753, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.054792941780760884, "learning_rate": 1.799382263858629e-07, "loss": 0.0022, "num_tokens": 40476053.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 435.625, "completions/mean_terminated_length": 435.625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.9459509315624424, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.04770046076737344, "learning_rate": 1.7872401460458878e-07, "loss": 0.0019, "num_tokens": 40488578.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 251.875, "completions/mean_terminated_length": 251.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.9461353993728094, "frac_reward_zero_std": 1.0, "grad_norm": 0.2275390625, "kl": 0.05892679514363408, "learning_rate": 1.7751387647651385e-07, "loss": 0.0024, "num_tokens": 40493777.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.9463198671831765, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.05566463200375438, "learning_rate": 1.7630781250357776e-07, "loss": 0.0022, "num_tokens": 40503852.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 383.625, "completions/mean_terminated_length": 383.625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.9465043349935436, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.050729077658616006, "learning_rate": 1.751058231860292e-07, "loss": 0.002, "num_tokens": 40512097.0, "reward": 1.8441557884216309, "reward_std": 0.28856727480888367, "rewards/fixed_code_pass_all_test_reward/mean": 0.8441558480262756, "rewards/fixed_code_pass_all_test_reward/std": 0.28856733441352844, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 743.25, "completions/mean_terminated_length": 743.25, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 0.9466888028039108, "frac_reward_zero_std": 0.0, "grad_norm": 0.625, "kl": 0.023052409989759326, "learning_rate": 1.7390790902242828e-07, "loss": 0.0009, "num_tokens": 40527643.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.9468732706142778, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.059664583997800946, "learning_rate": 1.7271407050964417e-07, "loss": 0.0024, "num_tokens": 40532242.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.9470577384246449, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.10573521303012967, "learning_rate": 1.7152430814285303e-07, "loss": 0.0042, "num_tokens": 40536134.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 5134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 177.5, "completions/mean_terminated_length": 177.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.947242206235012, "frac_reward_zero_std": 1.0, "grad_norm": 0.2255859375, "kl": 0.041048758663237095, "learning_rate": 1.7033862241554566e-07, "loss": 0.0016, "num_tokens": 40544162.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.9474266740453791, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.05326577881351113, "learning_rate": 1.6915701381951754e-07, "loss": 0.0021, "num_tokens": 40549411.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 453.625, "completions/mean_terminated_length": 453.625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.9476111418557461, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.04368264181539416, "learning_rate": 1.679794828448733e-07, "loss": 0.0017, "num_tokens": 40558664.0, "reward": 1.0285714864730835, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.02857142873108387, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 393.0, "completions/mean_terminated_length": 393.0, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.9477956096661133, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.025289216893725097, "learning_rate": 1.6680602998002892e-07, "loss": 0.001, "num_tokens": 40567648.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 287.0, "completions/mean_terminated_length": 287.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.9479800774764804, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.045692778658121824, "learning_rate": 1.6563665571170618e-07, "loss": 0.0018, "num_tokens": 40576136.0, "reward": 1.8815789222717285, "reward_std": 0.33494532108306885, "rewards/fixed_code_pass_all_test_reward/mean": 0.8815789222717285, "rewards/fixed_code_pass_all_test_reward/std": 0.33494532108306885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.9481645452868475, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.07687342446297407, "learning_rate": 1.6447136052493708e-07, "loss": 0.0031, "num_tokens": 40584511.0, "reward": 1.7999999523162842, "reward_std": 0.053452230989933014, "rewards/fixed_code_pass_all_test_reward/mean": 0.7999999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.05345224589109421, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9483490130972145, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.05385161261074245, "learning_rate": 1.6331014490306163e-07, "loss": 0.0022, "num_tokens": 40589597.0, "reward": 1.6895161867141724, "reward_std": 0.34396296739578247, "rewards/fixed_code_pass_all_test_reward/mean": 0.6895161271095276, "rewards/fixed_code_pass_all_test_reward/std": 0.34396299719810486, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 198.625, "completions/mean_terminated_length": 198.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.9485334809075816, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.06195459980517626, "learning_rate": 1.6215300932772572e-07, "loss": 0.0025, "num_tokens": 40597314.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.9487179487179487, "frac_reward_zero_std": 0.0, "grad_norm": 3.703125, "kl": 0.11955638788640499, "learning_rate": 1.6099995427888315e-07, "loss": 0.0048, "num_tokens": 40604007.0, "reward": 1.5625, "reward_std": 0.7288689613342285, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.45806270837783813, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9489024165283159, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.067227449035272, "learning_rate": 1.5985098023479806e-07, "loss": 0.0027, "num_tokens": 40614640.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 328.125, "completions/mean_terminated_length": 328.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.9490868843386829, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.05980717157945037, "learning_rate": 1.5870608767204032e-07, "loss": 0.0024, "num_tokens": 40621441.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 362.25, "completions/mean_terminated_length": 362.25, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.94927135214905, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.040057759964838624, "learning_rate": 1.5756527706548565e-07, "loss": 0.0016, "num_tokens": 40628099.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 376.75, "completions/mean_terminated_length": 376.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.9494558199594171, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.06453610793687403, "learning_rate": 1.5642854888831882e-07, "loss": 0.0026, "num_tokens": 40639689.0, "reward": 1.2999999523162842, "reward_std": 0.28507864475250244, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.28507867455482483, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 313.0, "completions/mean_terminated_length": 313.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.9496402877697842, "frac_reward_zero_std": 1.0, "grad_norm": 0.0458984375, "kl": 0.03100928384810686, "learning_rate": 1.5529590361202719e-07, "loss": 0.0012, "num_tokens": 40649881.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.9498247555801512, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.05520693492144346, "learning_rate": 1.5416734170641155e-07, "loss": 0.0022, "num_tokens": 40655970.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.9500092233905184, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.08161433436907828, "learning_rate": 1.530428636395731e-07, "loss": 0.0033, "num_tokens": 40660092.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.9501936912008855, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.04810501204337925, "learning_rate": 1.519224698779198e-07, "loss": 0.0019, "num_tokens": 40670282.0, "reward": 1.8888888359069824, "reward_std": 0.05939139798283577, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.059391383081674576, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 398.5, "completions/mean_terminated_length": 398.5, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.9503781590112526, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.07864664401859045, "learning_rate": 1.5080616088616884e-07, "loss": 0.0031, "num_tokens": 40677830.0, "reward": 1.4553570747375488, "reward_std": 0.24128389358520508, "rewards/fixed_code_pass_all_test_reward/mean": 0.4553571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.24128392338752747, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 247.625, "completions/mean_terminated_length": 247.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.9505626268216196, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.035298251546919346, "learning_rate": 1.496939371273398e-07, "loss": 0.0014, "num_tokens": 40683963.0, "reward": 1.7440476417541504, "reward_std": 0.1212383285164833, "rewards/fixed_code_pass_all_test_reward/mean": 0.7440476417541504, "rewards/fixed_code_pass_all_test_reward/std": 0.1212383583188057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 283.625, "completions/mean_terminated_length": 283.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.9507470946319867, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.05516406614333391, "learning_rate": 1.485857990627604e-07, "loss": 0.0022, "num_tokens": 40690056.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.9509315624423538, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.1124630281701684, "learning_rate": 1.4748174715206066e-07, "loss": 0.0045, "num_tokens": 40696900.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 215.5, "completions/mean_terminated_length": 215.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.9511160302527208, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.0482845522928983, "learning_rate": 1.4638178185317764e-07, "loss": 0.0019, "num_tokens": 40701432.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 150.375, "completions/mean_terminated_length": 150.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.951300498063088, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.11234329687431455, "learning_rate": 1.4528590362235527e-07, "loss": 0.0045, "num_tokens": 40705371.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 514.375, "completions/mean_terminated_length": 514.375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.9514849658734551, "frac_reward_zero_std": 1.0, "grad_norm": 0.048583984375, "kl": 0.02726498490665108, "learning_rate": 1.4419411291413888e-07, "loss": 0.0011, "num_tokens": 40718342.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.9516694336838222, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.05826803855597973, "learning_rate": 1.431064101813795e-07, "loss": 0.0023, "num_tokens": 40726049.0, "reward": 1.5458333492279053, "reward_std": 0.21817244589328766, "rewards/fixed_code_pass_all_test_reward/mean": 0.5458333492279053, "rewards/fixed_code_pass_all_test_reward/std": 0.21817243099212646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 276.375, "completions/mean_terminated_length": 276.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.9518539014941892, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.2096336642280221, "learning_rate": 1.4202279587523292e-07, "loss": 0.0084, "num_tokens": 40736668.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 200.375, "completions/mean_terminated_length": 200.375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.9520383693045563, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.054355359403416514, "learning_rate": 1.4094327044515853e-07, "loss": 0.0022, "num_tokens": 40741247.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.9522228371149234, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.03356174658983946, "learning_rate": 1.3986783433892038e-07, "loss": 0.0013, "num_tokens": 40748429.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 393.75, "completions/mean_terminated_length": 393.75, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.9524073049252906, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.038453968707472086, "learning_rate": 1.3879648800258606e-07, "loss": 0.0015, "num_tokens": 40755211.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.9525917727356576, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.054237571312114596, "learning_rate": 1.3772923188052788e-07, "loss": 0.0022, "num_tokens": 40763807.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.9527762405460247, "frac_reward_zero_std": 1.0, "grad_norm": 0.5, "kl": 0.08094883826561272, "learning_rate": 1.3666606641541957e-07, "loss": 0.0032, "num_tokens": 40769751.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 139.125, "completions/mean_terminated_length": 139.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.9529607083563918, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.08306427579373121, "learning_rate": 1.3560699204823834e-07, "loss": 0.0033, "num_tokens": 40775664.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 262.25, "completions/mean_terminated_length": 262.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.9531451761667589, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.04587854235433042, "learning_rate": 1.3455200921826838e-07, "loss": 0.0018, "num_tokens": 40785626.0, "reward": 1.9090909957885742, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.9533296439771259, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.08373581315390766, "learning_rate": 1.3350111836309076e-07, "loss": 0.0033, "num_tokens": 40793630.0, "reward": 1.65625, "reward_std": 0.6903350353240967, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.3582572042942047, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9535141117874931, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.03756368602626026, "learning_rate": 1.3245431991859457e-07, "loss": 0.0015, "num_tokens": 40800070.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.9536985795978602, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.05549324303865433, "learning_rate": 1.3141161431896809e-07, "loss": 0.0022, "num_tokens": 40808048.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.9538830474082273, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.02531611663289368, "learning_rate": 1.30373001996702e-07, "loss": 0.001, "num_tokens": 40816318.0, "reward": 1.2291667461395264, "reward_std": 0.04740733280777931, "rewards/fixed_code_pass_all_test_reward/mean": 0.2291666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.04740730673074722, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 172.0, "completions/mean_terminated_length": 172.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.9540675152185943, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.07537577208131552, "learning_rate": 1.293384833825917e-07, "loss": 0.003, "num_tokens": 40822718.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.9542519830289614, "frac_reward_zero_std": 1.0, "grad_norm": 0.55078125, "kl": 0.06126613914966583, "learning_rate": 1.2830805890573396e-07, "loss": 0.0025, "num_tokens": 40830016.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 433.75, "completions/mean_terminated_length": 433.75, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.9544364508393285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0289306640625, "kl": 0.015590569004416466, "learning_rate": 1.2728172899352466e-07, "loss": 0.0006, "num_tokens": 40841534.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.9546209186496957, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.044284747331403196, "learning_rate": 1.2625949407166437e-07, "loss": 0.0018, "num_tokens": 40846697.0, "reward": 1.6749999523162842, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 285.75, "completions/mean_terminated_length": 285.75, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.9548053864600627, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.040732945897616446, "learning_rate": 1.2524135456415286e-07, "loss": 0.0016, "num_tokens": 40853727.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 302.625, "completions/mean_terminated_length": 302.625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.9549898542704298, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.04648428503423929, "learning_rate": 1.2422731089329453e-07, "loss": 0.0019, "num_tokens": 40859060.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.9551743220807969, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.04675884498283267, "learning_rate": 1.232173634796918e-07, "loss": 0.0019, "num_tokens": 40863290.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 301.625, "completions/mean_terminated_length": 301.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.955358789891164, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.06375135807320476, "learning_rate": 1.2221151274224853e-07, "loss": 0.0026, "num_tokens": 40870239.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.955543257701531, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.0764386672526598, "learning_rate": 1.2120975909816978e-07, "loss": 0.0031, "num_tokens": 40877706.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.9557277255118982, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.03448250982910395, "learning_rate": 1.2021210296296216e-07, "loss": 0.0014, "num_tokens": 40882931.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.9559121933222653, "frac_reward_zero_std": 1.0, "grad_norm": 0.0400390625, "kl": 0.017074194620363414, "learning_rate": 1.1921854475043127e-07, "loss": 0.0007, "num_tokens": 40888247.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 526.0, "completions/mean_terminated_length": 526.0, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.9560966611326324, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.030766631505684927, "learning_rate": 1.1822908487268525e-07, "loss": 0.0012, "num_tokens": 40902615.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.9562811289429994, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.03669420047663152, "learning_rate": 1.1724372374012694e-07, "loss": 0.0015, "num_tokens": 40911697.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 582.0, "completions/mean_terminated_length": 582.0, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.9564655967533665, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.029882304137572646, "learning_rate": 1.1626246176146716e-07, "loss": 0.0012, "num_tokens": 40921945.0, "reward": 1.9642857313156128, "reward_std": 0.10101523250341415, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 210.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.9566500645637336, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.062129780650138855, "learning_rate": 1.1528529934370924e-07, "loss": 0.0025, "num_tokens": 40931601.0, "reward": 1.5267857313156128, "reward_std": 0.14253784716129303, "rewards/fixed_code_pass_all_test_reward/mean": 0.5267857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.1425379067659378, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.9568345323741008, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.06958970078267157, "learning_rate": 1.143122368921612e-07, "loss": 0.0028, "num_tokens": 40940453.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 162.25, "completions/mean_terminated_length": 162.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.9570190001844678, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.025383178028278053, "learning_rate": 1.1334327481042573e-07, "loss": 0.001, "num_tokens": 40946967.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 405.25, "completions/mean_terminated_length": 405.25, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.9572034679948349, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.03312348551116884, "learning_rate": 1.1237841350040912e-07, "loss": 0.0013, "num_tokens": 40955921.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.957387935805202, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.042340911692008376, "learning_rate": 1.114176533623157e-07, "loss": 0.0017, "num_tokens": 40962670.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 457.375, "completions/mean_terminated_length": 457.375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.9575724036155691, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.027249174076132476, "learning_rate": 1.1046099479464668e-07, "loss": 0.0011, "num_tokens": 40971777.0, "reward": 1.005357027053833, "reward_std": 0.007393531501293182, "rewards/fixed_code_pass_all_test_reward/mean": 0.0053571430034935474, "rewards/fixed_code_pass_all_test_reward/std": 0.007393559440970421, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 228.75, "completions/mean_terminated_length": 228.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.9577568714259361, "frac_reward_zero_std": 1.0, "grad_norm": 0.51953125, "kl": 0.07510224217548966, "learning_rate": 1.0950843819420354e-07, "loss": 0.003, "num_tokens": 40977935.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 301.125, "completions/mean_terminated_length": 301.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.9579413392363033, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.05353745887987316, "learning_rate": 1.085599839560858e-07, "loss": 0.0021, "num_tokens": 40987240.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 298.0, "completions/mean_terminated_length": 298.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.9581258070466704, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.028299424797296524, "learning_rate": 1.0761563247369323e-07, "loss": 0.0011, "num_tokens": 40994448.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 352.25, "completions/mean_terminated_length": 352.25, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.9583102748570375, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.045474061043933034, "learning_rate": 1.0667538413872136e-07, "loss": 0.0018, "num_tokens": 41001890.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 370.625, "completions/mean_terminated_length": 370.625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.9584947426674045, "frac_reward_zero_std": 0.0, "grad_norm": 8.5, "kl": 0.19800453691277653, "learning_rate": 1.0573923934116604e-07, "loss": 0.0079, "num_tokens": 41010247.0, "reward": 1.933510661125183, "reward_std": 0.04321296885609627, "rewards/fixed_code_pass_all_test_reward/mean": 0.9335106611251831, "rewards/fixed_code_pass_all_test_reward/std": 0.04321296513080597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.9586792104777716, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.06454876018688083, "learning_rate": 1.0480719846931775e-07, "loss": 0.0026, "num_tokens": 41018479.0, "reward": 1.10546875, "reward_std": 0.1180400550365448, "rewards/fixed_code_pass_all_test_reward/mean": 0.10546875, "rewards/fixed_code_pass_all_test_reward/std": 0.1180400550365448, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9588636782881387, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.056771908421069384, "learning_rate": 1.0387926190976838e-07, "loss": 0.0023, "num_tokens": 41022680.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.9590481460985059, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.05158998561091721, "learning_rate": 1.029554300474056e-07, "loss": 0.0021, "num_tokens": 41032012.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.9592326139088729, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.04694860184099525, "learning_rate": 1.0203570326541623e-07, "loss": 0.0019, "num_tokens": 41036234.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 300.0, "completions/mean_terminated_length": 300.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.95941708171924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.04015366639941931, "learning_rate": 1.0112008194528067e-07, "loss": 0.0016, "num_tokens": 41045098.0, "reward": 1.921875, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.9596015495296071, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.06642091972753406, "learning_rate": 1.0020856646678067e-07, "loss": 0.0027, "num_tokens": 41050956.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 358.125, "completions/mean_terminated_length": 358.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.9597860173399742, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.0835782241774723, "learning_rate": 9.930115720799271e-08, "loss": 0.0033, "num_tokens": 41059901.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.9599704851503412, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.054742117412388325, "learning_rate": 9.839785454529016e-08, "loss": 0.0022, "num_tokens": 41065917.0, "reward": 1.8333333730697632, "reward_std": 0.30860665440559387, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.30860668420791626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.9601549529607084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.050667899660766125, "learning_rate": 9.749865885334442e-08, "loss": 0.002, "num_tokens": 41073886.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.9603394207710755, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.043345407815650105, "learning_rate": 9.660357050512158e-08, "loss": 0.0017, "num_tokens": 41077809.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 304.5, "completions/mean_terminated_length": 304.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.9605238885814426, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.05803462537005544, "learning_rate": 9.571258987188469e-08, "loss": 0.0023, "num_tokens": 41088045.0, "reward": 1.6285715103149414, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6285714507102966, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 358.75, "completions/mean_terminated_length": 358.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.9607083563918096, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.031976773985661566, "learning_rate": 9.482571732319479e-08, "loss": 0.0013, "num_tokens": 41099147.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.9608928242021767, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.06101043219678104, "learning_rate": 9.394295322690538e-08, "loss": 0.0024, "num_tokens": 41103548.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.9610772920125438, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.057820688700303435, "learning_rate": 9.306429794917027e-08, "loss": 0.0023, "num_tokens": 41111281.0, "reward": 1.8869047164916992, "reward_std": 0.3198816180229187, "rewards/fixed_code_pass_all_test_reward/mean": 0.886904776096344, "rewards/fixed_code_pass_all_test_reward/std": 0.3198816478252411, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 215.0, "completions/mean_terminated_length": 215.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.961261759822911, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.07006285199895501, "learning_rate": 9.218975185443458e-08, "loss": 0.0028, "num_tokens": 41120969.0, "reward": 1.4196429252624512, "reward_std": 0.2207774668931961, "rewards/fixed_code_pass_all_test_reward/mean": 0.4196428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.22077752649784088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.961446227633278, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.06840512761846185, "learning_rate": 9.131931530544147e-08, "loss": 0.0027, "num_tokens": 41129818.0, "reward": 1.9848484992980957, "reward_std": 0.009351708926260471, "rewards/fixed_code_pass_all_test_reward/mean": 0.9848484992980957, "rewards/fixed_code_pass_all_test_reward/std": 0.009351727552711964, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 400.625, "completions/mean_terminated_length": 400.625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.9616306954436451, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.02520911255851388, "learning_rate": 9.045298866323105e-08, "loss": 0.001, "num_tokens": 41142503.0, "reward": 1.5625, "reward_std": 0.3104097247123718, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.31040969491004944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 340.875, "completions/mean_terminated_length": 340.875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.9618151632540122, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.027897749678231776, "learning_rate": 8.959077228713475e-08, "loss": 0.0011, "num_tokens": 41149318.0, "reward": 1.8888888359069824, "reward_std": 0.20573778450489044, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 278.625, "completions/mean_terminated_length": 278.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.9619996310643792, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.05896392371505499, "learning_rate": 8.873266653478207e-08, "loss": 0.0024, "num_tokens": 41157435.0, "reward": 1.8418368101119995, "reward_std": 0.3434050977230072, "rewards/fixed_code_pass_all_test_reward/mean": 0.8418367505073547, "rewards/fixed_code_pass_all_test_reward/std": 0.3434050977230072, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 156.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.9621840988747463, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.06572333117946982, "learning_rate": 8.78786717620972e-08, "loss": 0.0026, "num_tokens": 41164397.0, "reward": 1.9147727489471436, "reward_std": 0.24105912446975708, "rewards/fixed_code_pass_all_test_reward/mean": 0.9147727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.24105913937091827, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 499.625, "completions/mean_terminated_length": 499.625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.9623685666851135, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.027973042335361242, "learning_rate": 8.702878832329898e-08, "loss": 0.0011, "num_tokens": 41178146.0, "reward": 1.2638888359069824, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.3888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 254.875, "completions/mean_terminated_length": 254.875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.9625530344954806, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.10097783571109176, "learning_rate": 8.618301657089878e-08, "loss": 0.004, "num_tokens": 41186641.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.9627375023058476, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.05264139058999717, "learning_rate": 8.53413568557071e-08, "loss": 0.0021, "num_tokens": 41194652.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.9629219701162147, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.04566327459178865, "learning_rate": 8.450380952682357e-08, "loss": 0.0018, "num_tokens": 41205003.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 297.625, "completions/mean_terminated_length": 297.625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.9631064379265818, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.0341945510590449, "learning_rate": 8.367037493164699e-08, "loss": 0.0014, "num_tokens": 41210504.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.9632909057369489, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.055573346791788936, "learning_rate": 8.284105341586746e-08, "loss": 0.0022, "num_tokens": 41217870.0, "reward": 1.6048386096954346, "reward_std": 0.4638724625110626, "rewards/fixed_code_pass_all_test_reward/mean": 0.6048386693000793, "rewards/fixed_code_pass_all_test_reward/std": 0.4638724625110626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 235.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.9634753735473159, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.04277826158795506, "learning_rate": 8.201584532346873e-08, "loss": 0.0017, "num_tokens": 41227928.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.9636598413576831, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.041442428482696414, "learning_rate": 8.119475099673035e-08, "loss": 0.0017, "num_tokens": 41233492.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 325.625, "completions/mean_terminated_length": 325.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.9638443091680502, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.0553272501565516, "learning_rate": 8.037777077622322e-08, "loss": 0.0022, "num_tokens": 41243569.0, "reward": 1.388157844543457, "reward_std": 0.3776371479034424, "rewards/fixed_code_pass_all_test_reward/mean": 0.3881579041481018, "rewards/fixed_code_pass_all_test_reward/std": 0.3776371479034424, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.9640287769784173, "frac_reward_zero_std": 1.0, "grad_norm": 0.26171875, "kl": 0.06600389163941145, "learning_rate": 7.956490500081404e-08, "loss": 0.0026, "num_tokens": 41248348.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.9642132447887843, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "kl": 0.1688280995003879, "learning_rate": 7.8756154007662e-08, "loss": 0.0068, "num_tokens": 41252186.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.9643977125991514, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.07350975181907415, "learning_rate": 7.79515181322199e-08, "loss": 0.0029, "num_tokens": 41260762.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 631.625, "completions/mean_terminated_length": 631.625, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.9645821804095185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.0433313453104347, "learning_rate": 7.715099770823187e-08, "loss": 0.0017, "num_tokens": 41276567.0, "reward": 1.6363636255264282, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 494.0, "completions/mean_terminated_length": 494.0, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.9647666482198857, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.061316598672419786, "learning_rate": 7.635459306773785e-08, "loss": 0.0025, "num_tokens": 41291375.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 480.875, "completions/mean_terminated_length": 480.875, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.9649511160302527, "frac_reward_zero_std": 1.0, "grad_norm": 0.052978515625, "kl": 0.027773830806836486, "learning_rate": 7.556230454106916e-08, "loss": 0.0011, "num_tokens": 41301078.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 217.375, "completions/mean_terminated_length": 217.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.9651355838406198, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.05065850366372615, "learning_rate": 7.477413245684961e-08, "loss": 0.002, "num_tokens": 41308665.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 163.75, "completions/mean_terminated_length": 163.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.9653200516509869, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.06842301262076944, "learning_rate": 7.399007714199658e-08, "loss": 0.0027, "num_tokens": 41312711.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.965504519461354, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.0548136472934857, "learning_rate": 7.321013892171657e-08, "loss": 0.0022, "num_tokens": 41321733.0, "reward": 1.78125, "reward_std": 0.6187184453010559, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 504.375, "completions/mean_terminated_length": 504.375, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.965688987271721, "frac_reward_zero_std": 0.0, "grad_norm": 0.86328125, "kl": 0.051703379955142736, "learning_rate": 7.243431811951529e-08, "loss": 0.0021, "num_tokens": 41331488.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 369.5, "completions/mean_terminated_length": 369.5, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.9658734550820882, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.03682604408822954, "learning_rate": 7.166261505718419e-08, "loss": 0.0015, "num_tokens": 41337852.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.9660579228924553, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.05162555491551757, "learning_rate": 7.089503005480947e-08, "loss": 0.0021, "num_tokens": 41346162.0, "reward": 1.0089285373687744, "reward_std": 0.025253789499402046, "rewards/fixed_code_pass_all_test_reward/mean": 0.008928571827709675, "rewards/fixed_code_pass_all_test_reward/std": 0.025253813713788986, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 242.25, "completions/mean_terminated_length": 242.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.9662423907028224, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.046760416589677334, "learning_rate": 7.013156343076755e-08, "loss": 0.0019, "num_tokens": 41354692.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.9664268585131894, "frac_reward_zero_std": 1.0, "grad_norm": 0.154296875, "kl": 0.061138360761106014, "learning_rate": 6.937221550172957e-08, "loss": 0.0024, "num_tokens": 41364606.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 463.75, "completions/mean_terminated_length": 463.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9666113263235565, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.027933298726566136, "learning_rate": 6.861698658265692e-08, "loss": 0.0011, "num_tokens": 41373100.0, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 240.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.9667957941339236, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.07844673749059439, "learning_rate": 6.786587698680014e-08, "loss": 0.0031, "num_tokens": 41382837.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 178.625, "completions/mean_terminated_length": 178.625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.9669802619442908, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.06733493646606803, "learning_rate": 6.711888702570556e-08, "loss": 0.0027, "num_tokens": 41393322.0, "reward": 1.25, "reward_std": 0.05399487912654877, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.05399492755532265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 239.125, "completions/mean_terminated_length": 239.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.9671647297546578, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.05517291557043791, "learning_rate": 6.637601700920649e-08, "loss": 0.0022, "num_tokens": 41401923.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.9673491975650249, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.04998038709163666, "learning_rate": 6.563726724542974e-08, "loss": 0.002, "num_tokens": 41407253.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 166.125, "completions/mean_terminated_length": 166.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.967533665375392, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.03811976104043424, "learning_rate": 6.490263804079466e-08, "loss": 0.0015, "num_tokens": 41414222.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 316.25, "completions/mean_terminated_length": 316.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.967718133185759, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.055065608816221356, "learning_rate": 6.417212970000864e-08, "loss": 0.0022, "num_tokens": 41423208.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.9679026009961261, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.06356498971581459, "learning_rate": 6.344574252606928e-08, "loss": 0.0025, "num_tokens": 41433186.0, "reward": 1.1818182468414307, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1818181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 330.75, "completions/mean_terminated_length": 330.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.9680870688064933, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.06978353299200535, "learning_rate": 6.272347682026781e-08, "loss": 0.0028, "num_tokens": 41445048.0, "reward": 1.6636905670166016, "reward_std": 0.18527661263942719, "rewards/fixed_code_pass_all_test_reward/mean": 0.6636905074119568, "rewards/fixed_code_pass_all_test_reward/std": 0.18527662754058838, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 229.375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.9682715366168604, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.05167241161689162, "learning_rate": 6.200533288218458e-08, "loss": 0.0021, "num_tokens": 41450675.0, "reward": 1.794407844543457, "reward_std": 0.1352747082710266, "rewards/fixed_code_pass_all_test_reward/mean": 0.7944079041481018, "rewards/fixed_code_pass_all_test_reward/std": 0.13527469336986542, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 282.125, "completions/mean_terminated_length": 282.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9684560044272275, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.035193565068766475, "learning_rate": 6.12913110096891e-08, "loss": 0.0014, "num_tokens": 41457380.0, "reward": 1.5688774585723877, "reward_std": 0.3863620460033417, "rewards/fixed_code_pass_all_test_reward/mean": 0.6938775777816772, "rewards/fixed_code_pass_all_test_reward/std": 0.30699586868286133, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 292.625, "completions/mean_terminated_length": 292.625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.9686404722375945, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.06888553989119828, "learning_rate": 6.058141149894337e-08, "loss": 0.0028, "num_tokens": 41462593.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.9688249400479616, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "kl": 0.062011873815208673, "learning_rate": 5.987563464439627e-08, "loss": 0.0025, "num_tokens": 41470447.0, "reward": 1.2180233001708984, "reward_std": 0.024666527286171913, "rewards/fixed_code_pass_all_test_reward/mean": 0.21802324056625366, "rewards/fixed_code_pass_all_test_reward/std": 0.024666523560881615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 413.5, "completions/mean_terminated_length": 413.5, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.9690094078583287, "frac_reward_zero_std": 1.0, "grad_norm": 0.053466796875, "kl": 0.029803970246575773, "learning_rate": 5.917398073879144e-08, "loss": 0.0012, "num_tokens": 41479139.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 153.375, "completions/mean_terminated_length": 153.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.9691938756686959, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.03398190764710307, "learning_rate": 5.8476450073159385e-08, "loss": 0.0014, "num_tokens": 41484350.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 265.875, "completions/mean_terminated_length": 265.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.9693783434790629, "frac_reward_zero_std": 1.0, "grad_norm": 0.044189453125, "kl": 0.025205335346981883, "learning_rate": 5.77830429368198e-08, "loss": 0.001, "num_tokens": 41493669.0, "reward": 1.399999976158142, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.96956281128943, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.04511664784513414, "learning_rate": 5.7093759617382616e-08, "loss": 0.0018, "num_tokens": 41498324.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.9697472790997971, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.04905323777347803, "learning_rate": 5.640860040074803e-08, "loss": 0.002, "num_tokens": 41507411.0, "reward": 1.1875, "reward_std": 0.13614468276500702, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.1361447423696518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.9699317469101641, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.046028631972149014, "learning_rate": 5.572756557110537e-08, "loss": 0.0018, "num_tokens": 41511574.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.9701162147205312, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.031656795355957, "learning_rate": 5.5050655410934236e-08, "loss": 0.0013, "num_tokens": 41516934.0, "reward": 1.325657844543457, "reward_std": 0.009304000996053219, "rewards/fixed_code_pass_all_test_reward/mean": 0.3256579041481018, "rewards/fixed_code_pass_all_test_reward/std": 0.009304032661020756, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.9703006825308984, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.0665692905895412, "learning_rate": 5.437787020100116e-08, "loss": 0.0027, "num_tokens": 41524408.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 264.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.9704851503412655, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.05987384798936546, "learning_rate": 5.370921022036291e-08, "loss": 0.0024, "num_tokens": 41536718.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.9706696181516326, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.036329819122329354, "learning_rate": 5.3044675746365406e-08, "loss": 0.0015, "num_tokens": 41541074.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 446.75, "completions/mean_terminated_length": 446.75, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.9708540859619996, "frac_reward_zero_std": 1.0, "grad_norm": 0.058837890625, "kl": 0.035116976825520396, "learning_rate": 5.238426705464372e-08, "loss": 0.0014, "num_tokens": 41551920.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 384.375, "completions/mean_terminated_length": 384.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.9710385537723667, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.033035576168913394, "learning_rate": 5.172798441912097e-08, "loss": 0.0013, "num_tokens": 41563811.0, "reward": 1.9244792461395264, "reward_std": 0.213605135679245, "rewards/fixed_code_pass_all_test_reward/mean": 0.9244791865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.21360516548156738, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.9712230215827338, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.061985784326680005, "learning_rate": 5.107582811200829e-08, "loss": 0.0025, "num_tokens": 41571557.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 242.625, "completions/mean_terminated_length": 242.625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.971407489393101, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.0927488929592073, "learning_rate": 5.042779840380596e-08, "loss": 0.0037, "num_tokens": 41576290.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 345.75, "completions/mean_terminated_length": 345.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.971591957203468, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.07470339583232999, "learning_rate": 4.978389556330454e-08, "loss": 0.003, "num_tokens": 41583000.0, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.9717764250138351, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.08352646324783564, "learning_rate": 4.9144119857579276e-08, "loss": 0.0033, "num_tokens": 41594584.0, "reward": 1.787500023841858, "reward_std": 0.401559442281723, "rewards/fixed_code_pass_all_test_reward/mean": 0.7875000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.401559442281723, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 460.375, "completions/mean_terminated_length": 460.375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.9719608928242022, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.0728385706897825, "learning_rate": 4.850847155199567e-08, "loss": 0.0029, "num_tokens": 41607835.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 197.375, "completions/mean_terminated_length": 197.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.9721453606345692, "frac_reward_zero_std": 0.0, "grad_norm": 5.15625, "kl": 0.04751185933127999, "learning_rate": 4.787695091020616e-08, "loss": 0.0019, "num_tokens": 41616422.0, "reward": 1.817073106765747, "reward_std": 0.11290489137172699, "rewards/fixed_code_pass_all_test_reward/mean": 0.8170731663703918, "rewards/fixed_code_pass_all_test_reward/std": 0.11290489882230759, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 340.375, "completions/mean_terminated_length": 340.375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.9723298284449363, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.042368359165266156, "learning_rate": 4.7249558194153445e-08, "loss": 0.0017, "num_tokens": 41626097.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 235.375, "completions/mean_terminated_length": 235.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.9725142962553035, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.03237624326720834, "learning_rate": 4.6626293664066016e-08, "loss": 0.0013, "num_tokens": 41635036.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 382.75, "completions/mean_terminated_length": 382.75, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.9726987640656706, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.02342943788971752, "learning_rate": 4.6007157578459304e-08, "loss": 0.001, "num_tokens": 41643442.0, "reward": 1.0125000476837158, "reward_std": 0.005050757899880409, "rewards/fixed_code_pass_all_test_reward/mean": 0.012500000186264515, "rewards/fixed_code_pass_all_test_reward/std": 0.00505076302215457, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.9728832318760376, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.07079569343477488, "learning_rate": 4.539215019414012e-08, "loss": 0.0028, "num_tokens": 41650765.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.9730676996864047, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.044770004926249385, "learning_rate": 4.478127176619662e-08, "loss": 0.0018, "num_tokens": 41660001.0, "reward": 1.864898920059204, "reward_std": 0.3496149182319641, "rewards/fixed_code_pass_all_test_reward/mean": 0.8648989796638489, "rewards/fixed_code_pass_all_test_reward/std": 0.3496149480342865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 411.5, "completions/mean_terminated_length": 411.5, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.9732521674967718, "frac_reward_zero_std": 1.0, "grad_norm": 0.038330078125, "kl": 0.038410266395658255, "learning_rate": 4.417452254801169e-08, "loss": 0.0015, "num_tokens": 41672869.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 336.375, "completions/mean_terminated_length": 336.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.9734366353071389, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.06976801063865423, "learning_rate": 4.3571902791249564e-08, "loss": 0.0028, "num_tokens": 41682120.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 431.0, "completions/mean_terminated_length": 431.0, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.973621103117506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.051772967679426074, "learning_rate": 4.297341274586475e-08, "loss": 0.0021, "num_tokens": 41690728.0, "reward": 1.692307710647583, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.9738055709278731, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.07492655934765935, "learning_rate": 4.237905266009756e-08, "loss": 0.003, "num_tokens": 41698496.0, "reward": 1.899999976158142, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 261.0, "completions/mean_terminated_length": 261.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.9739900387382402, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.04101553792133927, "learning_rate": 4.1788822780476355e-08, "loss": 0.0016, "num_tokens": 41703856.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 217.375, "completions/mean_terminated_length": 217.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.9741745065486073, "frac_reward_zero_std": 1.0, "grad_norm": 1.4921875, "kl": 0.21791550051420927, "learning_rate": 4.1202723351815296e-08, "loss": 0.0087, "num_tokens": 41708307.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 5281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 301.0, "completions/mean_terminated_length": 301.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.9743589743589743, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.052525246515870094, "learning_rate": 4.062075461721548e-08, "loss": 0.0021, "num_tokens": 41714987.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 344.75, "completions/mean_terminated_length": 344.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.9745434421693414, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.06057804566808045, "learning_rate": 4.004291681806494e-08, "loss": 0.0024, "num_tokens": 41721161.0, "reward": 1.9249999523162842, "reward_std": 0.10350988060235977, "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.1035098284482956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 252.75, "completions/mean_terminated_length": 252.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.9747279099797086, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.08072010613977909, "learning_rate": 3.94692101940386e-08, "loss": 0.0032, "num_tokens": 41729079.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 257.5, "completions/mean_terminated_length": 257.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.9749123777900757, "frac_reward_zero_std": 1.0, "grad_norm": 0.197265625, "kl": 0.06285182386636734, "learning_rate": 3.8899634983097236e-08, "loss": 0.0025, "num_tokens": 41738491.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.9750968456004427, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.06004124227911234, "learning_rate": 3.833419142148964e-08, "loss": 0.0024, "num_tokens": 41749560.0, "reward": 1.682795763015747, "reward_std": 0.009955044835805893, "rewards/fixed_code_pass_all_test_reward/mean": 0.6827957034111023, "rewards/fixed_code_pass_all_test_reward/std": 0.009955045767128468, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.9752813134108098, "frac_reward_zero_std": 1.0, "grad_norm": 0.055419921875, "kl": 0.04257624037563801, "learning_rate": 3.7772879743749324e-08, "loss": 0.0017, "num_tokens": 41759437.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 307.0, "completions/mean_terminated_length": 307.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.9754657812211769, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.02888580271974206, "learning_rate": 3.721570018269449e-08, "loss": 0.0012, "num_tokens": 41764581.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 467.75, "completions/mean_terminated_length": 467.75, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.975650249031544, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.022541281825397164, "learning_rate": 3.66626529694325e-08, "loss": 0.0009, "num_tokens": 41775523.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 378.75, "completions/mean_terminated_length": 378.75, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.975834716841911, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.05219962075352669, "learning_rate": 3.6113738333355406e-08, "loss": 0.0021, "num_tokens": 41783145.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 5290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 310.625, "completions/mean_terminated_length": 310.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.9760191846522782, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.032305477594491094, "learning_rate": 3.5568956502141094e-08, "loss": 0.0013, "num_tokens": 41789310.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 322.375, "completions/mean_terminated_length": 322.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.9762036524626453, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.08110666554421186, "learning_rate": 3.502830770175214e-08, "loss": 0.0032, "num_tokens": 41796081.0, "reward": 1.6929347515106201, "reward_std": 0.4557724595069885, "rewards/fixed_code_pass_all_test_reward/mean": 0.6929347515106201, "rewards/fixed_code_pass_all_test_reward/std": 0.4557724595069885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 260.375, "completions/mean_terminated_length": 260.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.9763881202730124, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.07102947402745485, "learning_rate": 3.449179215644027e-08, "loss": 0.0028, "num_tokens": 41805500.0, "reward": 1.7366070747375488, "reward_std": 0.36568233370780945, "rewards/fixed_code_pass_all_test_reward/mean": 0.7366071343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.36568236351013184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.9765725880833794, "frac_reward_zero_std": 1.0, "grad_norm": 0.72265625, "kl": 0.08736635418608785, "learning_rate": 3.39594100887386e-08, "loss": 0.0035, "num_tokens": 41815229.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 304.5, "completions/mean_terminated_length": 304.5, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.9767570558937465, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.05646216729655862, "learning_rate": 3.3431161719468255e-08, "loss": 0.0023, "num_tokens": 41823625.0, "reward": 1.4375, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 484.125, "completions/mean_terminated_length": 484.125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.9769415237041136, "frac_reward_zero_std": 1.0, "grad_norm": 0.04052734375, "kl": 0.028018782031722367, "learning_rate": 3.290704726773619e-08, "loss": 0.0011, "num_tokens": 41832850.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 240.375, "completions/mean_terminated_length": 240.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.9771259915144808, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.07833970058709383, "learning_rate": 3.2387066950932965e-08, "loss": 0.0031, "num_tokens": 41840677.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 181.875, "completions/mean_terminated_length": 181.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.9773104593248478, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.02673303720075637, "learning_rate": 3.187122098473383e-08, "loss": 0.0011, "num_tokens": 41850188.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 168.625, "completions/mean_terminated_length": 168.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.9774949271352149, "frac_reward_zero_std": 1.0, "grad_norm": 0.171875, "kl": 0.05613635154440999, "learning_rate": 3.135950958310319e-08, "loss": 0.0022, "num_tokens": 41854313.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.977679394945582, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.061302305199205875, "learning_rate": 3.0851932958285704e-08, "loss": 0.0025, "num_tokens": 41859690.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.977863862755949, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.04694637353532016, "learning_rate": 3.034849132081519e-08, "loss": 0.0019, "num_tokens": 41864987.0, "reward": 1.894736886024475, "reward_std": 0.2023799568414688, "rewards/fixed_code_pass_all_test_reward/mean": 0.8947368860244751, "rewards/fixed_code_pass_all_test_reward/std": 0.20237997174263, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.9780483305663161, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.03089008352253586, "learning_rate": 2.984918487950683e-08, "loss": 0.0012, "num_tokens": 41873085.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.9782327983766833, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.047973362263292074, "learning_rate": 2.9354013841461638e-08, "loss": 0.0019, "num_tokens": 41878022.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 290.875, "completions/mean_terminated_length": 290.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.9784172661870504, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.08903633430600166, "learning_rate": 2.8862978412068643e-08, "loss": 0.0036, "num_tokens": 41887645.0, "reward": 1.521276593208313, "reward_std": 0.46503567695617676, "rewards/fixed_code_pass_all_test_reward/mean": 0.646276593208313, "rewards/fixed_code_pass_all_test_reward/std": 0.3180197775363922, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 299.0, "completions/mean_terminated_length": 299.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9786017339974175, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.02976604033028707, "learning_rate": 2.837607879499604e-08, "loss": 0.0012, "num_tokens": 41894733.0, "reward": 1.7901785373687744, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.7901785969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.9787862018077845, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.07160584814846516, "learning_rate": 2.7893315192200066e-08, "loss": 0.0029, "num_tokens": 41901552.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 417.75, "completions/mean_terminated_length": 417.75, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.9789706696181516, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.0279203521204181, "learning_rate": 2.7414687803920536e-08, "loss": 0.0011, "num_tokens": 41910294.0, "reward": 1.8068182468414307, "reward_std": 0.07805723696947098, "rewards/fixed_code_pass_all_test_reward/mean": 0.8068181276321411, "rewards/fixed_code_pass_all_test_reward/std": 0.07805725187063217, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.9791551374285187, "frac_reward_zero_std": 1.0, "grad_norm": 0.1865234375, "kl": 0.05838633142411709, "learning_rate": 2.6940196828681986e-08, "loss": 0.0023, "num_tokens": 41918533.0, "reward": 1.3658536672592163, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3658536672592163, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 195.625, "completions/mean_terminated_length": 195.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.9793396052388859, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.08666689228266478, "learning_rate": 2.646984246329254e-08, "loss": 0.0035, "num_tokens": 41926074.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.9795240730492529, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.14150910871103406, "learning_rate": 2.6003624902846137e-08, "loss": 0.0057, "num_tokens": 41936622.0, "reward": 1.9017857313156128, "reward_std": 0.27779191732406616, "rewards/fixed_code_pass_all_test_reward/mean": 0.9017857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.27779194712638855, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 299.625, "completions/mean_terminated_length": 299.625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.97970854085962, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.0460062709171325, "learning_rate": 2.5541544340719203e-08, "loss": 0.0018, "num_tokens": 41947899.0, "reward": 1.355769157409668, "reward_std": 0.009065450169146061, "rewards/fixed_code_pass_all_test_reward/mean": 0.35576921701431274, "rewards/fixed_code_pass_all_test_reward/std": 0.009065471589565277, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.9798930086699871, "frac_reward_zero_std": 1.0, "grad_norm": 0.228515625, "kl": 0.07123378361575305, "learning_rate": 2.5083600968572874e-08, "loss": 0.0028, "num_tokens": 41955878.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.9800774764803541, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.06565207708626986, "learning_rate": 2.4629794976351873e-08, "loss": 0.0026, "num_tokens": 41963496.0, "reward": 1.6875, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.45806270837783813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.9802619442907212, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.04971389891579747, "learning_rate": 2.4180126552284523e-08, "loss": 0.002, "num_tokens": 41973349.0, "reward": 1.6285715103149414, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6285714507102966, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 244.125, "completions/mean_terminated_length": 244.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9804464121010884, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.06059540994465351, "learning_rate": 2.3734595882884957e-08, "loss": 0.0024, "num_tokens": 41987062.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 155.875, "completions/mean_terminated_length": 155.875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.9806308799114555, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.058712615398690104, "learning_rate": 2.3293203152948697e-08, "loss": 0.0023, "num_tokens": 41991045.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9808153477218226, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.034420679905451834, "learning_rate": 2.285594854555595e-08, "loss": 0.0014, "num_tokens": 42002221.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 398.125, "completions/mean_terminated_length": 398.125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.9809998155321896, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.03243247454520315, "learning_rate": 2.2422832242071645e-08, "loss": 0.0013, "num_tokens": 42013854.0, "reward": 1.6136363744735718, "reward_std": 0.41304007172584534, "rewards/fixed_code_pass_all_test_reward/mean": 0.6136363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.4130401015281677, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.9811842833425567, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.08169630402699113, "learning_rate": 2.199385442214208e-08, "loss": 0.0033, "num_tokens": 42020164.0, "reward": 1.0, "reward_std": 0.3499270975589752, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 369.625, "completions/mean_terminated_length": 369.625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.9813687511529238, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.061218464747071266, "learning_rate": 2.1569015263697147e-08, "loss": 0.0024, "num_tokens": 42030417.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 231.0, "completions/mean_terminated_length": 231.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.981553218963291, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.06848766561597586, "learning_rate": 2.1148314942952552e-08, "loss": 0.0027, "num_tokens": 42038873.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 217.375, "completions/mean_terminated_length": 217.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.981737686773658, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.05736344517208636, "learning_rate": 2.0731753634405384e-08, "loss": 0.0023, "num_tokens": 42044476.0, "reward": 1.9008620977401733, "reward_std": 0.2804044187068939, "rewards/fixed_code_pass_all_test_reward/mean": 0.9008620977401733, "rewards/fixed_code_pass_all_test_reward/std": 0.2804044485092163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 342.125, "completions/mean_terminated_length": 342.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.9819221545840251, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.05988413421437144, "learning_rate": 2.0319331510835205e-08, "loss": 0.0024, "num_tokens": 42055781.0, "reward": 1.6736111640930176, "reward_std": 0.2059609293937683, "rewards/fixed_code_pass_all_test_reward/mean": 0.6736111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.2059609293937683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 388.25, "completions/mean_terminated_length": 388.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.9821066223943922, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.059233570005744696, "learning_rate": 1.9911048743306293e-08, "loss": 0.0024, "num_tokens": 42067063.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 607.625, "completions/mean_terminated_length": 607.625, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.9822910902047592, "frac_reward_zero_std": 1.0, "grad_norm": 0.216796875, "kl": 0.031761947146151215, "learning_rate": 1.950690550116652e-08, "loss": 0.0013, "num_tokens": 42082532.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 314.625, "completions/mean_terminated_length": 314.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.9824755580151263, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.05178978783078492, "learning_rate": 1.910690195204512e-08, "loss": 0.0021, "num_tokens": 42093737.0, "reward": 1.274999976158142, "reward_std": 0.4474797248840332, "rewards/fixed_code_pass_all_test_reward/mean": 0.2750000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.4474797546863556, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 437.125, "completions/mean_terminated_length": 437.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.9826600258254935, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.043250229908153415, "learning_rate": 1.871103826185383e-08, "loss": 0.0017, "num_tokens": 42104802.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 389.75, "completions/mean_terminated_length": 389.75, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.9828444936358606, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.03350448363926262, "learning_rate": 1.8319314594790193e-08, "loss": 0.0013, "num_tokens": 42115840.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 372.75, "completions/mean_terminated_length": 372.75, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.9830289614462276, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.11425920529291034, "learning_rate": 1.7931731113332017e-08, "loss": 0.0046, "num_tokens": 42123694.0, "reward": 0.9886363744735718, "reward_std": 0.40782636404037476, "rewards/fixed_code_pass_all_test_reward/mean": 0.11363636702299118, "rewards/fixed_code_pass_all_test_reward/std": 0.09409985691308975, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 232.625, "completions/mean_terminated_length": 232.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.9832134292565947, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.032772727543488145, "learning_rate": 1.7548287978239596e-08, "loss": 0.0013, "num_tokens": 42128875.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.9833978970669618, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.046573208877816796, "learning_rate": 1.7168985348559042e-08, "loss": 0.0019, "num_tokens": 42137925.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 341.875, "completions/mean_terminated_length": 341.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.9835823648773289, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.06786770792677999, "learning_rate": 1.6793823381614506e-08, "loss": 0.0027, "num_tokens": 42147372.0, "reward": 1.579545497894287, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.5795454978942871, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 210.75, "completions/mean_terminated_length": 210.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.983766832687696, "frac_reward_zero_std": 1.0, "grad_norm": 0.059814453125, "kl": 0.031236914568580687, "learning_rate": 1.6422802233017067e-08, "loss": 0.0012, "num_tokens": 42153058.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.9839513004980631, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.06528160441666842, "learning_rate": 1.605592205665696e-08, "loss": 0.0026, "num_tokens": 42161845.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.9841357683084302, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.07269561430439353, "learning_rate": 1.5693183004708013e-08, "loss": 0.0029, "num_tokens": 42167571.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 128.75, "completions/mean_terminated_length": 128.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.9843202361187973, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.06843814626336098, "learning_rate": 1.5334585227627652e-08, "loss": 0.0027, "num_tokens": 42173297.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 486.375, "completions/mean_terminated_length": 486.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.9845047039291643, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.05768426554277539, "learning_rate": 1.4980128874153566e-08, "loss": 0.0023, "num_tokens": 42185548.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.9846891717395314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.04646958666853607, "learning_rate": 1.4629814091307038e-08, "loss": 0.0019, "num_tokens": 42189636.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 323.5, "completions/mean_terminated_length": 323.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.9848736395498986, "frac_reward_zero_std": 1.0, "grad_norm": 0.05712890625, "kl": 0.04909154260531068, "learning_rate": 1.4283641024390726e-08, "loss": 0.002, "num_tokens": 42196872.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 336.5, "completions/mean_terminated_length": 336.5, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.9850581073602657, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.05190237099304795, "learning_rate": 1.3941609816990887e-08, "loss": 0.0021, "num_tokens": 42203332.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 350.375, "completions/mean_terminated_length": 350.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.9852425751706327, "frac_reward_zero_std": 1.0, "grad_norm": 0.056640625, "kl": 0.030055111972615123, "learning_rate": 1.3603720610972926e-08, "loss": 0.0012, "num_tokens": 42215015.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 630.125, "completions/mean_terminated_length": 630.125, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.9854270429809998, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.035764947067946196, "learning_rate": 1.3269973546488069e-08, "loss": 0.0014, "num_tokens": 42230968.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.9856115107913669, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.05420084949582815, "learning_rate": 1.2940368761964472e-08, "loss": 0.0022, "num_tokens": 42235339.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 122.0, "completions/mean_terminated_length": 122.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.985795978601734, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.07169168815016747, "learning_rate": 1.2614906394118331e-08, "loss": 0.0029, "num_tokens": 42241011.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 463.375, "completions/mean_terminated_length": 463.375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.9859804464121011, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.0314541996922344, "learning_rate": 1.2293586577941664e-08, "loss": 0.0013, "num_tokens": 42249678.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9861649142224682, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.037275388080161065, "learning_rate": 1.1976409446713411e-08, "loss": 0.0015, "num_tokens": 42255249.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 360.625, "completions/mean_terminated_length": 360.625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.9863493820328353, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.02300552447559312, "learning_rate": 1.1663375131989452e-08, "loss": 0.0009, "num_tokens": 42263334.0, "reward": 1.5833333730697632, "reward_std": 0.2357023060321808, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 249.5, "completions/mean_terminated_length": 249.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.9865338498432024, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.07441280828788877, "learning_rate": 1.1354483763611478e-08, "loss": 0.003, "num_tokens": 42272706.0, "reward": 1.052884578704834, "reward_std": 0.028616318479180336, "rewards/fixed_code_pass_all_test_reward/mean": 0.05288461968302727, "rewards/fixed_code_pass_all_test_reward/std": 0.028616301715373993, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.9867183176535694, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.10679358430206776, "learning_rate": 1.104973546970034e-08, "loss": 0.0043, "num_tokens": 42276509.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 5349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 180.375, "completions/mean_terminated_length": 180.375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.9869027854639365, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.04766136524267495, "learning_rate": 1.0749130376659366e-08, "loss": 0.0019, "num_tokens": 42285048.0, "reward": 1.5565476417541504, "reward_std": 0.1329851895570755, "rewards/fixed_code_pass_all_test_reward/mean": 0.5565476417541504, "rewards/fixed_code_pass_all_test_reward/std": 0.1329851597547531, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.9870872532743037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.06496472703292966, "learning_rate": 1.0452668609172157e-08, "loss": 0.0026, "num_tokens": 42294897.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 358.375, "completions/mean_terminated_length": 358.375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.9872717210846708, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.0449696535943076, "learning_rate": 1.016035029020479e-08, "loss": 0.0018, "num_tokens": 42302236.0, "reward": 1.7168368101119995, "reward_std": 0.22394250333309174, "rewards/fixed_code_pass_all_test_reward/mean": 0.71683669090271, "rewards/fixed_code_pass_all_test_reward/std": 0.22394247353076935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.9874561888950378, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.12702241726219654, "learning_rate": 9.872175541005835e-09, "loss": 0.0051, "num_tokens": 42311212.0, "reward": 1.587499976158142, "reward_std": 0.3226563632488251, "rewards/fixed_code_pass_all_test_reward/mean": 0.5874999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.32265639305114746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 401.125, "completions/mean_terminated_length": 401.125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.9876406567054049, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.027299814217258245, "learning_rate": 9.588144481101902e-09, "loss": 0.0011, "num_tokens": 42319541.0, "reward": 1.3273810148239136, "reward_std": 0.1878000646829605, "rewards/fixed_code_pass_all_test_reward/mean": 0.3273809552192688, "rewards/fixed_code_pass_all_test_reward/std": 0.1878000795841217, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 407.125, "completions/mean_terminated_length": 407.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.987825124515772, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0536393818911165, "learning_rate": 9.308257228305418e-09, "loss": 0.0021, "num_tokens": 42332006.0, "reward": 1.7440476417541504, "reward_std": 0.13287094235420227, "rewards/fixed_code_pass_all_test_reward/mean": 0.7440476417541504, "rewards/fixed_code_pass_all_test_reward/std": 0.13287092745304108, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 530.625, "completions/mean_terminated_length": 530.625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.988009592326139, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.024934152490459383, "learning_rate": 9.032513898705741e-09, "loss": 0.001, "num_tokens": 42347611.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 266.375, "completions/mean_terminated_length": 266.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.9881940601365061, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.056074209278449416, "learning_rate": 8.76091460667583e-09, "loss": 0.0022, "num_tokens": 42353814.0, "reward": 1.6683673858642578, "reward_std": 0.5145297646522522, "rewards/fixed_code_pass_all_test_reward/mean": 0.7933673858642578, "rewards/fixed_code_pass_all_test_reward/std": 0.33363938331604004, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 430.125, "completions/mean_terminated_length": 430.125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.9883785279468733, "frac_reward_zero_std": 1.0, "grad_norm": 0.375, "kl": 0.058239422738552094, "learning_rate": 8.493459464868903e-09, "loss": 0.0023, "num_tokens": 42367623.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 539.75, "completions/mean_terminated_length": 539.75, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.9885629957572404, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.027888058917596936, "learning_rate": 8.230148584219555e-09, "loss": 0.0011, "num_tokens": 42383205.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.9887474635676075, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.059167173225432634, "learning_rate": 7.970982073944866e-09, "loss": 0.0024, "num_tokens": 42388980.0, "reward": 1.90816330909729, "reward_std": 0.1388441026210785, "rewards/fixed_code_pass_all_test_reward/mean": 0.90816330909729, "rewards/fixed_code_pass_all_test_reward/std": 0.1388440877199173, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 178.625, "completions/mean_terminated_length": 178.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.9889319313779745, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.21827690489590168, "learning_rate": 7.71596004153996e-09, "loss": 0.0087, "num_tokens": 42396465.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 368.0, "completions/mean_terminated_length": 368.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.9891163991883416, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.05542493867687881, "learning_rate": 7.465082592782446e-09, "loss": 0.0022, "num_tokens": 42407033.0, "reward": 1.444892406463623, "reward_std": 0.246940016746521, "rewards/fixed_code_pass_all_test_reward/mean": 0.4448924660682678, "rewards/fixed_code_pass_all_test_reward/std": 0.24694004654884338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 257.125, "completions/mean_terminated_length": 257.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.9893008669987087, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.046531248139217496, "learning_rate": 7.218349831731309e-09, "loss": 0.0019, "num_tokens": 42415434.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 386.625, "completions/mean_terminated_length": 386.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.9894853348090759, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.05426017288118601, "learning_rate": 6.975761860726904e-09, "loss": 0.0022, "num_tokens": 42425791.0, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 417.375, "completions/mean_terminated_length": 417.375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.9896698026194429, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.038181459065526724, "learning_rate": 6.737318780387636e-09, "loss": 0.0015, "num_tokens": 42437930.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 308.5, "completions/mean_terminated_length": 308.5, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.98985427042981, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.052539577009156346, "learning_rate": 6.503020689615502e-09, "loss": 0.0021, "num_tokens": 42448646.0, "reward": 1.9293477535247803, "reward_std": 0.19983454048633575, "rewards/fixed_code_pass_all_test_reward/mean": 0.929347813129425, "rewards/fixed_code_pass_all_test_reward/std": 0.19983454048633575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 414.0, "completions/mean_terminated_length": 414.0, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.9900387382401771, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.04202340077608824, "learning_rate": 6.272867685592765e-09, "loss": 0.0017, "num_tokens": 42457206.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.9902232060505441, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.07210710505023599, "learning_rate": 6.046859863781951e-09, "loss": 0.0029, "num_tokens": 42467219.0, "reward": 1.6490384340286255, "reward_std": 0.6795023083686829, "rewards/fixed_code_pass_all_test_reward/mean": 0.7740384340286255, "rewards/fixed_code_pass_all_test_reward/std": 0.33995521068573, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 276.375, "completions/mean_terminated_length": 276.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.9904076738609112, "frac_reward_zero_std": 1.0, "grad_norm": 0.058349609375, "kl": 0.03287680959329009, "learning_rate": 5.824997317924741e-09, "loss": 0.0013, "num_tokens": 42476822.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.9905921416712784, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.08719502389431, "learning_rate": 5.607280140046412e-09, "loss": 0.0035, "num_tokens": 42486541.0, "reward": 1.6666667461395264, "reward_std": 0.35634827613830566, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.35634830594062805, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 381.375, "completions/mean_terminated_length": 381.375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.9907766094816455, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.03093393100425601, "learning_rate": 5.393708420450284e-09, "loss": 0.0012, "num_tokens": 42497584.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.9909610772920125, "frac_reward_zero_std": 1.0, "grad_norm": 0.255859375, "kl": 0.10202880506403744, "learning_rate": 5.1842822477221614e-09, "loss": 0.0041, "num_tokens": 42504862.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.9911455451023796, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.05649484531022608, "learning_rate": 4.9790017087270044e-09, "loss": 0.0023, "num_tokens": 42511351.0, "reward": 1.3977272510528564, "reward_std": 0.6973342895507812, "rewards/fixed_code_pass_all_test_reward/mean": 0.5227272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.5136852264404297, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 354.625, "completions/mean_terminated_length": 354.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.9913300129127467, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.048411587718874216, "learning_rate": 4.777866888611149e-09, "loss": 0.0019, "num_tokens": 42518868.0, "reward": 1.5372806787490845, "reward_std": 0.7156479954719543, "rewards/fixed_code_pass_all_test_reward/mean": 0.6622806787490845, "rewards/fixed_code_pass_all_test_reward/std": 0.4668823778629303, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.9915144807231138, "frac_reward_zero_std": 1.0, "grad_norm": 0.04833984375, "kl": 0.03704563365317881, "learning_rate": 4.5808778708011925e-09, "loss": 0.0015, "num_tokens": 42527023.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 286.5, "completions/mean_terminated_length": 286.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.991698948533481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.04198466241359711, "learning_rate": 4.388034737002889e-09, "loss": 0.0017, "num_tokens": 42535947.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.991883416343848, "frac_reward_zero_std": 1.0, "grad_norm": 0.2255859375, "kl": 0.06577351968735456, "learning_rate": 4.199337567203365e-09, "loss": 0.0026, "num_tokens": 42540114.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.9920678841542151, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.1030859723687172, "learning_rate": 4.014786439672236e-09, "loss": 0.0041, "num_tokens": 42547488.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 618.625, "completions/mean_terminated_length": 618.625, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.9922523519645822, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.03241926361806691, "learning_rate": 3.834381430954936e-09, "loss": 0.0013, "num_tokens": 42559013.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 404.0, "completions/mean_terminated_length": 404.0, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.9924368197749492, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.05305333889555186, "learning_rate": 3.6581226158804994e-09, "loss": 0.0021, "num_tokens": 42570805.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.9926212875853163, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.06624983111396432, "learning_rate": 3.486010067557111e-09, "loss": 0.0027, "num_tokens": 42578361.0, "reward": 1.7702702283859253, "reward_std": 0.24559147655963898, "rewards/fixed_code_pass_all_test_reward/mean": 0.7702702283859253, "rewards/fixed_code_pass_all_test_reward/std": 0.2455914318561554, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.9928057553956835, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.04608732624910772, "learning_rate": 3.318043857374331e-09, "loss": 0.0018, "num_tokens": 42582551.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 246.75, "completions/mean_terminated_length": 246.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.9929902232060506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.04503745771944523, "learning_rate": 3.1542240549986557e-09, "loss": 0.0018, "num_tokens": 42587733.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.9931746910164176, "frac_reward_zero_std": 1.0, "grad_norm": 0.203125, "kl": 0.04525235015898943, "learning_rate": 2.9945507283812848e-09, "loss": 0.0018, "num_tokens": 42598160.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.9933591588267847, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.052492949878796935, "learning_rate": 2.839023943750352e-09, "loss": 0.0021, "num_tokens": 42607310.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 267.125, "completions/mean_terminated_length": 267.125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9935436266371518, "frac_reward_zero_std": 1.0, "grad_norm": 0.060302734375, "kl": 0.046862140763551, "learning_rate": 2.6876437656153663e-09, "loss": 0.0019, "num_tokens": 42612847.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.9937280944475189, "frac_reward_zero_std": 1.0, "grad_norm": 0.640625, "kl": 0.1489154496230185, "learning_rate": 2.540410256764991e-09, "loss": 0.006, "num_tokens": 42619323.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 323.75, "completions/mean_terminated_length": 323.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.993912562257886, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.05827320017851889, "learning_rate": 2.3973234782681542e-09, "loss": 0.0023, "num_tokens": 42627697.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 425.25, "completions/mean_terminated_length": 425.25, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.9940970300682531, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.11272447369992733, "learning_rate": 2.258383489475158e-09, "loss": 0.0045, "num_tokens": 42635619.0, "reward": 1.6071429252624512, "reward_std": 0.27795591950416565, "rewards/fixed_code_pass_all_test_reward/mean": 0.6071428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.27795588970184326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 314.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.9942814978786202, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.06275238865055144, "learning_rate": 2.1235903480154587e-09, "loss": 0.0025, "num_tokens": 42644989.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.9944659656889873, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.060752252116799355, "learning_rate": 1.9929441097976677e-09, "loss": 0.0024, "num_tokens": 42654261.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 339.75, "completions/mean_terminated_length": 339.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.9946504334993543, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.04421283898409456, "learning_rate": 1.8664448290106608e-09, "loss": 0.0018, "num_tokens": 42663835.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.9948349013097214, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.05447358335368335, "learning_rate": 1.7440925581246882e-09, "loss": 0.0022, "num_tokens": 42673736.0, "reward": 1.2861841917037964, "reward_std": 0.28842511773109436, "rewards/fixed_code_pass_all_test_reward/mean": 0.2861842215061188, "rewards/fixed_code_pass_all_test_reward/std": 0.28842514753341675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 379.375, "completions/mean_terminated_length": 379.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.9950193691200886, "frac_reward_zero_std": 1.0, "grad_norm": 0.05029296875, "kl": 0.03242441266775131, "learning_rate": 1.625887347886934e-09, "loss": 0.0013, "num_tokens": 42681923.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.9952038369304557, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.03542395122349262, "learning_rate": 1.5118292473292883e-09, "loss": 0.0014, "num_tokens": 42688767.0, "reward": 1.7806122303009033, "reward_std": 0.18015749752521515, "rewards/fixed_code_pass_all_test_reward/mean": 0.7806122303009033, "rewards/fixed_code_pass_all_test_reward/std": 0.18015746772289276, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 342.625, "completions/mean_terminated_length": 342.625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.9953883047408227, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.03936150576919317, "learning_rate": 1.401918303758354e-09, "loss": 0.0016, "num_tokens": 42697204.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 392.5, "completions/mean_terminated_length": 392.5, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.9955727725511898, "frac_reward_zero_std": 1.0, "grad_norm": 0.0137939453125, "kl": 0.02056081802584231, "learning_rate": 1.296154562763219e-09, "loss": 0.0008, "num_tokens": 42706128.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 378.625, "completions/mean_terminated_length": 378.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.9957572403615569, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.04551311954855919, "learning_rate": 1.1945380682132357e-09, "loss": 0.0018, "num_tokens": 42712317.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 247.25, "completions/mean_terminated_length": 247.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.995941708171924, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.09185827895998955, "learning_rate": 1.097068862254691e-09, "loss": 0.0037, "num_tokens": 42722607.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.9961261759822911, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.058558444026857615, "learning_rate": 1.0037469853185767e-09, "loss": 0.0023, "num_tokens": 42728117.0, "reward": 1.5909090042114258, "reward_std": 0.45584234595298767, "rewards/fixed_code_pass_all_test_reward/mean": 0.5909091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.4558423161506653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 312.875, "completions/mean_terminated_length": 312.875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.9963106437926582, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.03864565561525524, "learning_rate": 9.145724761105979e-10, "loss": 0.0015, "num_tokens": 42739572.0, "reward": 1.6136363744735718, "reward_std": 0.26055067777633667, "rewards/fixed_code_pass_all_test_reward/mean": 0.6136363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.26055067777633667, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.9964951116030253, "frac_reward_zero_std": 0.0, "grad_norm": 3.203125, "kl": 0.0810376862064004, "learning_rate": 8.295453716200552e-10, "loss": 0.0032, "num_tokens": 42743532.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 288.625, "completions/mean_terminated_length": 288.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.9966795794133924, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.0416314018657431, "learning_rate": 7.486657071120729e-10, "loss": 0.0017, "num_tokens": 42753585.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.9968640472237594, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.06281688529998064, "learning_rate": 6.719335161364804e-10, "loss": 0.0025, "num_tokens": 42761303.0, "reward": 1.8851351737976074, "reward_std": 0.21268843114376068, "rewards/fixed_code_pass_all_test_reward/mean": 0.8851351141929626, "rewards/fixed_code_pass_all_test_reward/std": 0.2126884162425995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 306.5, "completions/mean_terminated_length": 306.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.9970485150341265, "frac_reward_zero_std": 1.0, "grad_norm": 0.04541015625, "kl": 0.014927109121344984, "learning_rate": 5.993488305178208e-10, "loss": 0.0006, "num_tokens": 42768843.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.9972329828444937, "frac_reward_zero_std": 0.0, "grad_norm": 3.640625, "kl": 0.11465034121647477, "learning_rate": 5.309116803642323e-10, "loss": 0.0046, "num_tokens": 42775408.0, "reward": 1.5394736528396606, "reward_std": 0.503057599067688, "rewards/fixed_code_pass_all_test_reward/mean": 0.5394736528396606, "rewards/fixed_code_pass_all_test_reward/std": 0.5030575394630432, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 246.875, "completions/mean_terminated_length": 246.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.9974174506548608, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.05233809817582369, "learning_rate": 4.666220940618971e-10, "loss": 0.0021, "num_tokens": 42781399.0, "reward": 1.640625, "reward_std": 0.4033204913139343, "rewards/fixed_code_pass_all_test_reward/mean": 0.765625, "rewards/fixed_code_pass_all_test_reward/std": 0.32346823811531067, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 559.75, "completions/mean_terminated_length": 559.75, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.9976019184652278, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.046522683813236654, "learning_rate": 4.0648009827504166e-10, "loss": 0.0019, "num_tokens": 42790869.0, "reward": 1.0250000953674316, "reward_std": 0.37701839208602905, "rewards/fixed_code_pass_all_test_reward/mean": 0.15000000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.053452249616384506, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9977863862755949, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.052476686192676425, "learning_rate": 3.504857179514876e-10, "loss": 0.0021, "num_tokens": 42798333.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.997970854085962, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.05781742790713906, "learning_rate": 2.9863897631488004e-10, "loss": 0.0023, "num_tokens": 42808475.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 190.625, "completions/mean_terminated_length": 190.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.998155321896329, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.09369877772405744, "learning_rate": 2.509398948713493e-10, "loss": 0.0037, "num_tokens": 42814656.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.9983397897066962, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.05621596262790263, "learning_rate": 2.073884934039594e-10, "loss": 0.0022, "num_tokens": 42819791.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 5412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 411.125, "completions/mean_terminated_length": 411.125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.9985242575170633, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.030041843187063932, "learning_rate": 1.679847899782594e-10, "loss": 0.0012, "num_tokens": 42828592.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9987087253274304, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.0664087226614356, "learning_rate": 1.3272880093784246e-10, "loss": 0.0027, "num_tokens": 42839317.0, "reward": 1.2857142686843872, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.9988931931377975, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.05534454435110092, "learning_rate": 1.0162054090545604e-10, "loss": 0.0022, "num_tokens": 42844187.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 321.25, "completions/mean_terminated_length": 321.25, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.9990776609481645, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0772966630756855, "learning_rate": 7.466002278522233e-11, "loss": 0.0031, "num_tokens": 42853669.0, "reward": 1.6785714626312256, "reward_std": 0.3949388265609741, "rewards/fixed_code_pass_all_test_reward/mean": 0.6785714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.3949388265609741, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.9992621287585316, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.044086889014579356, "learning_rate": 5.1847257758197433e-11, "loss": 0.0018, "num_tokens": 42865224.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 352.75, "completions/mean_terminated_length": 352.75, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.9994465965688988, "frac_reward_zero_std": 1.0, "grad_norm": 0.059814453125, "kl": 0.028363195713609457, "learning_rate": 3.31822552879224e-11, "loss": 0.0011, "num_tokens": 42871766.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 360.0, "completions/mean_terminated_length": 360.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9996310643792659, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.06553801940754056, "learning_rate": 1.8665023114872172e-11, "loss": 0.0026, "num_tokens": 42878406.0, "reward": 1.453125, "reward_std": 0.22097086906433105, "rewards/fixed_code_pass_all_test_reward/mean": 0.453125, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 217.625, "completions/mean_terminated_length": 217.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.9998155321896329, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.1058179596439004, "learning_rate": 8.295567263116866e-12, "loss": 0.0042, "num_tokens": 42886331.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 314.75, "completions/mean_terminated_length": 314.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 1.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.062400333350524306, "learning_rate": 2.073892030329816e-12, "loss": 0.0025, "num_tokens": 42892969.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5421 } ], "logging_steps": 1, "max_steps": 5421, "num_input_tokens_seen": 42892969, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }