{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1473, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 633.125, "completions/mean_terminated_length": 633.125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.40213863365352154, "epoch": 0.0006788866259334691, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 8665.0, "reward": 1.4499999284744263, "reward_std": 0.978336751461029, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10350984334945679, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 462.0, "completions/mean_terminated_length": 462.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.48721940256655216, "epoch": 0.0013577732518669382, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "learning_rate": 1.3513513513513515e-07, "loss": 0.0, "num_tokens": 15553.0, "reward": 1.0833333730697632, "reward_std": 0.8355209231376648, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2083333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23754701018333435, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 383.125, "completions/mean_terminated_length": 383.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.2951512522995472, "epoch": 0.002036659877800407, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "learning_rate": 2.702702702702703e-07, "loss": 0.0, "num_tokens": 21634.0, "reward": 1.0, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 588.25, "completions/mean_terminated_length": 588.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.45288635790348053, "epoch": 0.0027155465037338763, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "learning_rate": 4.0540540540540546e-07, "loss": 0.0, "num_tokens": 29468.0, "reward": 0.550000011920929, "reward_std": 0.6023762226104736, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213627576828, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 467.25, "completions/mean_terminated_length": 467.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.3951845243573189, "epoch": 0.0033944331296673455, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "learning_rate": 5.405405405405406e-07, "loss": 0.0, "num_tokens": 36478.0, "reward": 1.0833333730697632, "reward_std": 0.7292092442512512, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0833333358168602, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022911310196, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 571.125, "completions/mean_terminated_length": 571.125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6701111160218716, "epoch": 0.004073319755600814, "frac_reward_zero_std": 0.0, "grad_norm": 66.5, "learning_rate": 6.756756756756758e-07, "loss": -0.0, "num_tokens": 44455.0, "reward": 0.5357142686843872, "reward_std": 0.7889543771743774, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0357142873108387, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10101525485515594, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 357.0, "completions/mean_terminated_length": 357.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5524849742650986, "epoch": 0.0047522063815342835, "frac_reward_zero_std": 0.0, "grad_norm": 123.5, "learning_rate": 8.108108108108109e-07, "loss": 0.0, "num_tokens": 50351.0, "reward": 1.0729167461395264, "reward_std": 0.8699543476104736, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 781.0, "completions/mean_terminated_length": 781.0, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "entropy": 0.4235018901526928, "epoch": 0.005431093007467753, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "learning_rate": 9.459459459459461e-07, "loss": -0.0, "num_tokens": 60079.0, "reward": 0.1875, "reward_std": 0.5303300619125366, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 606.125, "completions/mean_terminated_length": 606.125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "entropy": 0.3649282669648528, "epoch": 0.006109979633401222, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "learning_rate": 1.0810810810810812e-06, "loss": 0.0, "num_tokens": 68432.0, "reward": 1.2000000476837158, "reward_std": 1.025740146636963, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20000000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3082207143306732, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.353199296630919, "epoch": 0.006788866259334691, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "learning_rate": 1.2162162162162164e-06, "loss": 0.0, "num_tokens": 73469.0, "reward": 1.6979167461395264, "reward_std": 0.7765633463859558, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1979166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.30190369486808777, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 846.875, "completions/mean_terminated_length": 846.875, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "entropy": 0.1927033788524568, "epoch": 0.00746775288526816, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "learning_rate": 1.3513513513513515e-06, "loss": -0.0, "num_tokens": 86140.0, "reward": 0.6607142686843872, "reward_std": 0.3142625093460083, "rewards/fixed_code_pass_all_test_reward/mean": 0.6607142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.3142625391483307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 588.25, "completions/mean_terminated_length": 588.25, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.45569038949906826, "epoch": 0.008146639511201629, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "learning_rate": 1.4864864864864868e-06, "loss": -0.0, "num_tokens": 94262.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 345.125, "completions/mean_terminated_length": 345.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.425748098641634, "epoch": 0.008825526137135099, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "learning_rate": 1.6216216216216219e-06, "loss": 0.0, "num_tokens": 100047.0, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 365.0, "completions/mean_terminated_length": 365.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 0.3499553017318249, "epoch": 0.009504412763068567, "frac_reward_zero_std": 0.0, "grad_norm": 13.75, "learning_rate": 1.756756756756757e-06, "loss": -0.0, "num_tokens": 106583.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 841.625, "completions/mean_terminated_length": 841.625, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "entropy": 0.34041350334882736, "epoch": 0.010183299389002037, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "learning_rate": 1.8918918918918922e-06, "loss": 0.0, "num_tokens": 116532.0, "reward": 0.559374988079071, "reward_std": 0.8740236759185791, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05937499925494194, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13224539160728455, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 422.25, "completions/mean_terminated_length": 422.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.34066037461161613, "epoch": 0.010862186014935505, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "learning_rate": 2.0270270270270273e-06, "loss": 0.0, "num_tokens": 123006.0, "reward": 0.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 423.0, "completions/mean_terminated_length": 423.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.41253336891531944, "epoch": 0.011541072640868975, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "learning_rate": 2.1621621621621623e-06, "loss": 0.0, "num_tokens": 129830.0, "reward": 1.03125, "reward_std": 0.60411536693573, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 527.625, "completions/mean_terminated_length": 527.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5557551775127649, "epoch": 0.012219959266802444, "frac_reward_zero_std": 0.0, "grad_norm": 31.5, "learning_rate": 2.297297297297298e-06, "loss": -0.0, "num_tokens": 137259.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1220.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 787.625, "completions/mean_terminated_length": 787.625, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "entropy": 0.4465682953596115, "epoch": 0.012898845892735914, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "learning_rate": 2.432432432432433e-06, "loss": -0.0, "num_tokens": 149472.0, "reward": 0.6120129823684692, "reward_std": 0.5126845836639404, "rewards/fixed_code_pass_all_test_reward/mean": 0.3214285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.21257825195789337, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.040584415197372437, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0758657306432724, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 504.125, "completions/mean_terminated_length": 504.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.4042566269636154, "epoch": 0.013577732518669382, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "learning_rate": 2.5675675675675675e-06, "loss": -0.0, "num_tokens": 156881.0, "reward": 1.379166603088379, "reward_std": 0.6281586289405823, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04583333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08533315360546112, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 420.125, "completions/mean_terminated_length": 420.125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.19113011565059423, "epoch": 0.014256619144602852, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "learning_rate": 2.702702702702703e-06, "loss": -0.0, "num_tokens": 164658.0, "reward": 1.4261903762817383, "reward_std": 0.8017059564590454, "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.36154431104660034, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1011904776096344, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11418647319078445, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 551.625, "completions/mean_terminated_length": 551.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.4456657748669386, "epoch": 0.01493550577053632, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "learning_rate": 2.837837837837838e-06, "loss": -0.0, "num_tokens": 172447.0, "reward": 0.5625, "reward_std": 0.6232117414474487, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 709.0, "completions/mean_terminated_length": 709.0, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "entropy": 0.3877583211287856, "epoch": 0.015614392396469789, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "learning_rate": 2.9729729729729736e-06, "loss": 0.0, "num_tokens": 183479.0, "reward": 0.28125, "reward_std": 0.5250425338745117, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 909.5, "completions/mean_terminated_length": 909.5, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "entropy": 0.43758431635797024, "epoch": 0.016293279022403257, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1081081081081082e-06, "loss": 0.0, "num_tokens": 194547.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 494.25, "completions/mean_terminated_length": 494.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4515352062880993, "epoch": 0.01697216564833673, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "learning_rate": 3.2432432432432437e-06, "loss": -0.0, "num_tokens": 201741.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 449.625, "completions/mean_terminated_length": 449.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.2880000276491046, "epoch": 0.017651052274270197, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "learning_rate": 3.3783783783783788e-06, "loss": 0.0, "num_tokens": 209866.0, "reward": 1.4809027910232544, "reward_std": 0.9505121111869812, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3399054706096649, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1684027761220932, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.257219135761261, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 420.0, "completions/mean_terminated_length": 420.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6037653535604477, "epoch": 0.018329938900203666, "frac_reward_zero_std": 0.0, "grad_norm": 139.0, "learning_rate": 3.513513513513514e-06, "loss": 0.0, "num_tokens": 216490.0, "reward": 1.0833333730697632, "reward_std": 0.7292092442512512, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0833333358168602, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022911310196, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 775.875, "completions/mean_terminated_length": 775.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.4370947852730751, "epoch": 0.019008825526137134, "frac_reward_zero_std": 0.0, "grad_norm": 6.53125, "learning_rate": 3.648648648648649e-06, "loss": 0.0, "num_tokens": 226937.0, "reward": 0.535714328289032, "reward_std": 0.7071068286895752, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 530.75, "completions/mean_terminated_length": 530.75, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.444356651045382, "epoch": 0.019687712152070606, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "learning_rate": 3.7837837837837844e-06, "loss": 0.0, "num_tokens": 234879.0, "reward": 1.2026515007019043, "reward_std": 0.9371897578239441, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07765151560306549, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16307373344898224, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 385.375, "completions/mean_terminated_length": 385.375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.48405807465314865, "epoch": 0.020366598778004074, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "learning_rate": 3.918918918918919e-06, "loss": -0.0, "num_tokens": 241018.0, "reward": 0.78125, "reward_std": 0.4898523688316345, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 373.125, "completions/mean_terminated_length": 373.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.46748646907508373, "epoch": 0.021045485403937542, "frac_reward_zero_std": 0.0, "grad_norm": 3.375, "learning_rate": 4.0540540540540545e-06, "loss": -0.0, "num_tokens": 247091.0, "reward": 1.1875, "reward_std": 0.752970278263092, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 962.625, "completions/mean_terminated_length": 962.625, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "entropy": 0.41946573927998543, "epoch": 0.02172437202987101, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "learning_rate": 4.189189189189189e-06, "loss": 0.0, "num_tokens": 261432.0, "reward": 1.0071429014205933, "reward_std": 0.6755626201629639, "rewards/fixed_code_pass_all_test_reward/mean": 0.7142857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.33284708857536316, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04285714402794838, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08081220835447311, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 592.75, "completions/mean_terminated_length": 592.75, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "entropy": 0.6089797131717205, "epoch": 0.02240325865580448, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "learning_rate": 4.324324324324325e-06, "loss": -0.0, "num_tokens": 269414.0, "reward": 0.78125, "reward_std": 0.4898523688316345, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 682.875, "completions/mean_terminated_length": 682.875, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "entropy": 0.5007057599723339, "epoch": 0.02308214528173795, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "learning_rate": 4.45945945945946e-06, "loss": 0.0, "num_tokens": 278229.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 357.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3946137595921755, "epoch": 0.02376103190767142, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "learning_rate": 4.594594594594596e-06, "loss": -0.0, "num_tokens": 284113.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 671.125, "completions/mean_terminated_length": 671.125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "entropy": 0.41237878799438477, "epoch": 0.024439918533604887, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "learning_rate": 4.72972972972973e-06, "loss": -0.0, "num_tokens": 293930.0, "reward": 1.0499999523162842, "reward_std": 0.4750939905643463, "rewards/fixed_code_pass_all_test_reward/mean": 0.675000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.36936238408088684, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 564.375, "completions/mean_terminated_length": 564.375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "entropy": 0.42290690913796425, "epoch": 0.025118805159538356, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "learning_rate": 4.864864864864866e-06, "loss": 0.0, "num_tokens": 301701.0, "reward": 1.1916667222976685, "reward_std": 0.7487027049064636, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06666667014360428, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12848322093486786, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.41047218441963196, "epoch": 0.025797691785471828, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "learning_rate": 5e-06, "loss": -0.0, "num_tokens": 307246.0, "reward": 0.875, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 417.25, "completions/mean_terminated_length": 417.25, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.322323614731431, "epoch": 0.026476578411405296, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "learning_rate": 5.135135135135135e-06, "loss": -0.0, "num_tokens": 313680.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 599.5, "completions/mean_terminated_length": 599.5, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.5022741565480828, "epoch": 0.027155465037338764, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "learning_rate": 5.2702702702702705e-06, "loss": 0.0, "num_tokens": 321756.0, "reward": 0.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 332.75, "completions/mean_terminated_length": 332.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.31443312019109726, "epoch": 0.027834351663272233, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "learning_rate": 5.405405405405406e-06, "loss": 0.0, "num_tokens": 327602.0, "reward": 0.78125, "reward_std": 0.7727212905883789, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 698.5, "completions/mean_terminated_length": 698.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.4371532369405031, "epoch": 0.028513238289205704, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "learning_rate": 5.540540540540541e-06, "loss": 0.0, "num_tokens": 336894.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 384.375, "completions/mean_terminated_length": 384.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.36207926645874977, "epoch": 0.029192124915139173, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "learning_rate": 5.675675675675676e-06, "loss": -0.0, "num_tokens": 344273.0, "reward": 1.0809524059295654, "reward_std": 0.6699240803718567, "rewards/fixed_code_pass_all_test_reward/mean": 0.32499998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.4652188718318939, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.130952388048172, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1813279390335083, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 552.25, "completions/mean_terminated_length": 552.25, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "entropy": 0.3683111499994993, "epoch": 0.02987101154107264, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "learning_rate": 5.810810810810811e-06, "loss": -0.0, "num_tokens": 352795.0, "reward": 1.359375, "reward_std": 0.4974825084209442, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.015625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.04419417306780815, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 298.375, "completions/mean_terminated_length": 298.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.42081066966056824, "epoch": 0.03054989816700611, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "learning_rate": 5.945945945945947e-06, "loss": -0.0, "num_tokens": 358422.0, "reward": 0.675000011920929, "reward_std": 0.8137215971946716, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 590.375, "completions/mean_terminated_length": 590.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.3734747637063265, "epoch": 0.031228784792939578, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "learning_rate": 6.081081081081082e-06, "loss": -0.0, "num_tokens": 366513.0, "reward": 1.056249976158142, "reward_std": 0.606769859790802, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05624999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10500850528478622, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 758.625, "completions/mean_terminated_length": 758.625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.2996700187213719, "epoch": 0.031907671418873046, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "learning_rate": 6.2162162162162164e-06, "loss": 0.0, "num_tokens": 377510.0, "reward": 0.8854166865348816, "reward_std": 0.7694333791732788, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.47715675830841064, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10803020745515823, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 481.625, "completions/mean_terminated_length": 481.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.43572198040783405, "epoch": 0.032586558044806514, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "learning_rate": 6.351351351351351e-06, "loss": 0.0, "num_tokens": 384859.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 508.5, "completions/mean_terminated_length": 508.5, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.18101190589368343, "epoch": 0.03326544467073999, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "learning_rate": 6.486486486486487e-06, "loss": -0.0, "num_tokens": 394199.0, "reward": 1.1744047403335571, "reward_std": 0.5458797216415405, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.29940474033355713, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.29570335149765015, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.24147845432162285, "epoch": 0.03394433129667346, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "learning_rate": 6.621621621621622e-06, "loss": 0.0, "num_tokens": 398901.0, "reward": 1.3416666984558105, "reward_std": 1.0042370557785034, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20158106088638306, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 390.25, "completions/mean_terminated_length": 390.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.332314838655293, "epoch": 0.034623217922606926, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "learning_rate": 6.7567567567567575e-06, "loss": -0.0, "num_tokens": 406047.0, "reward": 1.066666603088379, "reward_std": 0.49022185802459717, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.17817415297031403, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 358.25, "completions/mean_terminated_length": 358.25, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.24127027858048677, "epoch": 0.035302104548540394, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "learning_rate": 6.891891891891892e-06, "loss": -0.0, "num_tokens": 412489.0, "reward": 1.4166666269302368, "reward_std": 0.7292091250419617, "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02083333395421505, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 363.75, "completions/mean_terminated_length": 363.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.18655581679195166, "epoch": 0.03598099117447386, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "learning_rate": 7.027027027027028e-06, "loss": -0.0, "num_tokens": 420511.0, "reward": 1.4736607074737549, "reward_std": 0.9764032959938049, "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.43490222096443176, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13437500596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20219223201274872, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 466.125, "completions/mean_terminated_length": 466.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.2667345628142357, "epoch": 0.03665987780040733, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "learning_rate": 7.162162162162163e-06, "loss": -0.0, "num_tokens": 427328.0, "reward": 1.910416603088379, "reward_std": 0.5594737529754639, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.16041666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.175693079829216, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 381.25, "completions/mean_terminated_length": 381.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.27860070299357176, "epoch": 0.0373387644263408, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "learning_rate": 7.297297297297298e-06, "loss": -0.0, "num_tokens": 433586.0, "reward": 1.4723213911056519, "reward_std": 1.0072075128555298, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.22232142090797424, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17692866921424866, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 519.0, "completions/mean_terminated_length": 519.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.4064774829894304, "epoch": 0.03801765105227427, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "learning_rate": 7.4324324324324324e-06, "loss": 0.0, "num_tokens": 440834.0, "reward": 0.47142860293388367, "reward_std": 0.655632734298706, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09642857313156128, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15569837391376495, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 474.75, "completions/mean_terminated_length": 474.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.2492697569541633, "epoch": 0.038696537678207736, "frac_reward_zero_std": 0.0, "grad_norm": 158.0, "learning_rate": 7.567567567567569e-06, "loss": -0.0, "num_tokens": 450936.0, "reward": 1.7020833492279053, "reward_std": 0.8942691683769226, "rewards/fixed_code_pass_all_test_reward/mean": 0.7250000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.22708332538604736, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2007797509431839, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 282.75, "completions/mean_terminated_length": 282.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2521714773029089, "epoch": 0.03937542430414121, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "learning_rate": 7.702702702702704e-06, "loss": 0.0, "num_tokens": 456534.0, "reward": 2.258333444595337, "reward_std": 1.1282448768615723, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6333333253860474, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4201284646987915, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 105.625, "completions/mean_terminated_length": 105.625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.08102542627602816, "epoch": 0.04005431093007468, "frac_reward_zero_std": 0.0, "grad_norm": 3.625, "learning_rate": 7.837837837837838e-06, "loss": -0.0, "num_tokens": 460739.0, "reward": 2.4375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.13168837316334248, "epoch": 0.04073319755600815, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "learning_rate": 7.972972972972974e-06, "loss": -0.0, "num_tokens": 466835.0, "reward": 0.8357142806053162, "reward_std": 0.6430158615112305, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.21380901336669922, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06071428954601288, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11473128199577332, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 357.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.17302474100142717, "epoch": 0.041412084181941616, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "learning_rate": 8.108108108108109e-06, "loss": -0.0, "num_tokens": 474039.0, "reward": 0.8999999761581421, "reward_std": 0.37032803893089294, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 283.25, "completions/mean_terminated_length": 283.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.09238436678424478, "epoch": 0.042090970807875085, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "learning_rate": 8.243243243243245e-06, "loss": -0.0, "num_tokens": 480905.0, "reward": 1.488541603088379, "reward_std": 0.7430623173713684, "rewards/fixed_code_pass_all_test_reward/mean": 0.484375, "rewards/fixed_code_pass_all_test_reward/std": 0.2259652018547058, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2541666626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19755448400974274, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 372.875, "completions/mean_terminated_length": 372.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.06192499818280339, "epoch": 0.04276985743380855, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "learning_rate": 8.378378378378378e-06, "loss": 0.0, "num_tokens": 489120.0, "reward": 1.954545497894287, "reward_std": 0.04859290271997452, "rewards/fixed_code_pass_all_test_reward/mean": 0.9545454978942871, "rewards/fixed_code_pass_all_test_reward/std": 0.0485929399728775, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.15581836737692356, "epoch": 0.04344874405974202, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "learning_rate": 8.513513513513514e-06, "loss": 0.0, "num_tokens": 493837.0, "reward": 2.214583396911621, "reward_std": 0.654498279094696, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4645833671092987, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2778085470199585, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 496.5, "completions/mean_terminated_length": 496.5, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.23688332177698612, "epoch": 0.04412763068567549, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "learning_rate": 8.64864864864865e-06, "loss": 0.0, "num_tokens": 502857.0, "reward": 1.0428571701049805, "reward_std": 0.0808122381567955, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04285714402794838, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08081220835447311, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 323.25, "completions/mean_terminated_length": 323.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.15722941514104605, "epoch": 0.04480651731160896, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "learning_rate": 8.783783783783785e-06, "loss": 0.0, "num_tokens": 509059.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.15436407551169395, "epoch": 0.04548540393754243, "frac_reward_zero_std": 0.0, "grad_norm": 3.625, "learning_rate": 8.91891891891892e-06, "loss": 0.0, "num_tokens": 513500.0, "reward": 2.2875001430511475, "reward_std": 0.5453810095787048, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4125000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23363077640533447, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.11615594383329153, "epoch": 0.0461642905634759, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.054054054054054e-06, "loss": 0.0, "num_tokens": 517608.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 469.25, "completions/mean_terminated_length": 469.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.13107811007648706, "epoch": 0.04684317718940937, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "learning_rate": 9.189189189189191e-06, "loss": 0.0, "num_tokens": 526130.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 142.0, "completions/mean_terminated_length": 142.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.15522114373743534, "epoch": 0.04752206381534284, "frac_reward_zero_std": 0.0, "grad_norm": 3.5625, "learning_rate": 9.324324324324325e-06, "loss": 0.0, "num_tokens": 530474.0, "reward": 1.5, "reward_std": 0.4960158169269562, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19416078925132751, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 134.5, "completions/mean_terminated_length": 134.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.16009862814098597, "epoch": 0.048200950441276307, "frac_reward_zero_std": 0.0, "grad_norm": 37.0, "learning_rate": 9.45945945945946e-06, "loss": -0.0, "num_tokens": 534606.0, "reward": 1.837499976158142, "reward_std": 0.763333797454834, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08749999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.13566663209348917, "epoch": 0.048879837067209775, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "learning_rate": 9.594594594594594e-06, "loss": -0.0, "num_tokens": 540146.0, "reward": 1.4553570747375488, "reward_std": 0.629321813583374, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0803571417927742, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1157275140285492, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 161.125, "completions/mean_terminated_length": 161.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.21239402517676353, "epoch": 0.04955872369314324, "frac_reward_zero_std": 0.0, "grad_norm": 4.84375, "learning_rate": 9.729729729729732e-06, "loss": -0.0, "num_tokens": 544683.0, "reward": 1.4562499523162842, "reward_std": 0.6668415069580078, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20625001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25006943941116333, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.08463525306433439, "epoch": 0.05023761031907671, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "learning_rate": 9.864864864864865e-06, "loss": 0.0, "num_tokens": 549878.0, "reward": 2.0833334922790527, "reward_std": 0.46929535269737244, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2083333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18898224830627441, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.1347042741253972, "epoch": 0.05091649694501019, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "learning_rate": 1e-05, "loss": 0.0, "num_tokens": 554399.0, "reward": 2.0625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 222.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.10731179267168045, "epoch": 0.051595383570943655, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "learning_rate": 1.0135135135135136e-05, "loss": -0.0, "num_tokens": 559603.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 333.125, "completions/mean_terminated_length": 333.125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.13763546478003263, "epoch": 0.05227427019687712, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "learning_rate": 1.027027027027027e-05, "loss": -0.0, "num_tokens": 566868.0, "reward": 2.0958333015441895, "reward_std": 0.36250340938568115, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.4225771427154541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5958333611488342, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12141691148281097, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 153.125, "completions/mean_terminated_length": 153.125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.12349202670156956, "epoch": 0.05295315682281059, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "learning_rate": 1.0405405405405407e-05, "loss": -0.0, "num_tokens": 571157.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 161.0, "completions/mean_terminated_length": 161.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.13015724625438452, "epoch": 0.05363204344874406, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0540540540540541e-05, "loss": 0.0, "num_tokens": 575677.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.11832852475345135, "epoch": 0.05431093007467753, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "learning_rate": 1.0675675675675677e-05, "loss": 0.0, "num_tokens": 580384.0, "reward": 1.212499976158142, "reward_std": 0.760521411895752, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4625000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3159452974796295, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 244.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.10070886136963964, "epoch": 0.054989816700611, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "learning_rate": 1.0810810810810812e-05, "loss": 0.0, "num_tokens": 585981.0, "reward": 0.9375, "reward_std": 0.6662945747375488, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12400397658348083, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.12286796048283577, "epoch": 0.055668703326544465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0945945945945946e-05, "loss": 0.0, "num_tokens": 592643.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.1339035602286458, "epoch": 0.05634758995247793, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "learning_rate": 1.1081081081081081e-05, "loss": -0.0, "num_tokens": 597249.0, "reward": 1.693750023841858, "reward_std": 1.0695621967315674, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3187499940395355, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36747071146965027, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 365.0, "completions/mean_terminated_length": 365.0, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.13325223326683044, "epoch": 0.05702647657841141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1216216216216219e-05, "loss": 0.0, "num_tokens": 604713.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 428.125, "completions/mean_terminated_length": 428.125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.07309685787186027, "epoch": 0.05770536320434488, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "learning_rate": 1.1351351351351352e-05, "loss": -0.0, "num_tokens": 614834.0, "reward": 1.8297618627548218, "reward_std": 0.7953521013259888, "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.22587695717811584, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25833332538604736, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17861904203891754, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 219.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.053572315722703934, "epoch": 0.058384249830278345, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "learning_rate": 1.1486486486486488e-05, "loss": 0.0, "num_tokens": 621619.0, "reward": 2.909090995788574, "reward_std": 0.2571297883987427, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 151.25, "completions/mean_terminated_length": 151.25, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.23708575032651424, "epoch": 0.059063136456211814, "frac_reward_zero_std": 0.0, "grad_norm": 3.984375, "learning_rate": 1.1621621621621622e-05, "loss": 0.0, "num_tokens": 625925.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 341.125, "completions/mean_terminated_length": 341.125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.146900518797338, "epoch": 0.05974202308214528, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "learning_rate": 1.1756756756756757e-05, "loss": 0.0, "num_tokens": 632598.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 197.625, "completions/mean_terminated_length": 197.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.27483534440398216, "epoch": 0.06042090970807875, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "learning_rate": 1.1891891891891894e-05, "loss": -0.0, "num_tokens": 637435.0, "reward": 1.7999999523162842, "reward_std": 0.6553807258605957, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17500001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24348658323287964, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 268.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.05631826026365161, "epoch": 0.06109979633401222, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "learning_rate": 1.2027027027027028e-05, "loss": -0.0, "num_tokens": 644637.0, "reward": 2.5568182468414307, "reward_std": 0.21115268766880035, "rewards/fixed_code_pass_all_test_reward/mean": 0.9318181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.19284729659557343, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511381149292, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.11315709352493286, "epoch": 0.06177868295994569, "frac_reward_zero_std": 0.0, "grad_norm": 3.796875, "learning_rate": 1.2162162162162164e-05, "loss": -0.0, "num_tokens": 649251.0, "reward": 0.8999999761581421, "reward_std": 0.37032803893089294, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.09401944745332003, "epoch": 0.062457569585879155, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "learning_rate": 1.2297297297297299e-05, "loss": -0.0, "num_tokens": 656479.0, "reward": 1.3214285373687744, "reward_std": 0.44361361861228943, "rewards/fixed_code_pass_all_test_reward/mean": 0.3214285671710968, "rewards/fixed_code_pass_all_test_reward/std": 0.44361358880996704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.0514632654376328, "epoch": 0.06313645621181263, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "learning_rate": 1.2432432432432433e-05, "loss": 0.0, "num_tokens": 663305.0, "reward": 2.261805534362793, "reward_std": 0.15215781331062317, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.26180553436279297, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15215782821178436, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.14608134049922228, "epoch": 0.06381534283774609, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2567567567567568e-05, "loss": 0.0, "num_tokens": 668590.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.097068106289953, "epoch": 0.06449422946367957, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "learning_rate": 1.2702702702702702e-05, "loss": 0.0, "num_tokens": 675053.0, "reward": 2.0875000953674316, "reward_std": 0.31038394570350647, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.24306972324848175, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3375000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12368355691432953, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07860143668949604, "epoch": 0.06517311608961303, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "learning_rate": 1.283783783783784e-05, "loss": -0.0, "num_tokens": 679438.0, "reward": 2.0333333015441895, "reward_std": 0.5294501185417175, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.28333336114883423, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3075762987136841, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.062403345480561256, "epoch": 0.0658520027155465, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "learning_rate": 1.2972972972972975e-05, "loss": 0.0, "num_tokens": 685547.0, "reward": 2.9000000953674316, "reward_std": 0.15118584036827087, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8999999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1511857807636261, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 317.375, "completions/mean_terminated_length": 317.375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.11283600656315684, "epoch": 0.06653088934147998, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "learning_rate": 1.3108108108108109e-05, "loss": -0.0, "num_tokens": 693230.0, "reward": 1.5511904954910278, "reward_std": 0.6588498950004578, "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.1428571492433548, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3369047939777374, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24302050471305847, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 298.375, "completions/mean_terminated_length": 298.375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.14692620281130075, "epoch": 0.06720977596741344, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "learning_rate": 1.3243243243243244e-05, "loss": 0.0, "num_tokens": 698977.0, "reward": 1.765625, "reward_std": 0.4745180904865265, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.015625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.04419417306780815, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 207.5, "completions/mean_terminated_length": 207.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.1271934425458312, "epoch": 0.06788866259334692, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "learning_rate": 1.3378378378378381e-05, "loss": -0.0, "num_tokens": 703989.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.09632621146738529, "epoch": 0.06856754921928038, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "learning_rate": 1.3513513513513515e-05, "loss": -0.0, "num_tokens": 708986.0, "reward": 1.5499999523162842, "reward_std": 0.4869731366634369, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 178.375, "completions/mean_terminated_length": 178.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.11827277857810259, "epoch": 0.06924643584521385, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "learning_rate": 1.364864864864865e-05, "loss": 0.0, "num_tokens": 713597.0, "reward": 1.6041667461395264, "reward_std": 0.6542748212814331, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1041666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15268756449222565, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.1445675678551197, "epoch": 0.06992532247114731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3783783783783784e-05, "loss": 0.0, "num_tokens": 718809.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.15129757719114423, "epoch": 0.07060420909708079, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "learning_rate": 1.391891891891892e-05, "loss": 0.0, "num_tokens": 725501.0, "reward": 1.5, "reward_std": 0.3598114550113678, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 130.125, "completions/mean_terminated_length": 130.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.0953492745757103, "epoch": 0.07128309572301425, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "learning_rate": 1.4054054054054055e-05, "loss": 0.0, "num_tokens": 729806.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.09533433010801673, "epoch": 0.07196198234894773, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "learning_rate": 1.4189189189189189e-05, "loss": -0.0, "num_tokens": 735012.0, "reward": 2.625, "reward_std": 0.4940117299556732, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2182178944349289, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 186.0, "completions/mean_terminated_length": 186.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.092557767406106, "epoch": 0.0726408689748812, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "learning_rate": 1.4324324324324326e-05, "loss": 0.0, "num_tokens": 740492.0, "reward": 2.107142925262451, "reward_std": 0.5918954014778137, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428060531616, "rewards/fixed_code_pass_all_test_reward/std": 0.2020305097103119, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17251639068126678, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2892663460224867, "epoch": 0.07331975560081466, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "learning_rate": 1.4459459459459462e-05, "loss": 0.0, "num_tokens": 745470.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 211.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.12156647210940719, "epoch": 0.07399864222674814, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "learning_rate": 1.4594594594594596e-05, "loss": 0.0, "num_tokens": 751463.0, "reward": 1.6375000476837158, "reward_std": 0.19955304265022278, "rewards/fixed_code_pass_all_test_reward/mean": 0.5750000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.070710688829422, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.11055296938866377, "epoch": 0.0746775288526816, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "learning_rate": 1.4729729729729731e-05, "loss": -0.0, "num_tokens": 756897.0, "reward": 1.524999976158142, "reward_std": 0.7778174877166748, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.038223384879529476, "epoch": 0.07535641547861507, "frac_reward_zero_std": 0.0, "grad_norm": 0.578125, "learning_rate": 1.4864864864864865e-05, "loss": 0.0, "num_tokens": 763395.0, "reward": 2.5416667461395264, "reward_std": 0.19416078925132751, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5416666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19416078925132751, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.1549054579809308, "epoch": 0.07603530210454854, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "learning_rate": 1.5000000000000002e-05, "loss": -0.0, "num_tokens": 769027.0, "reward": 1.4583332538604736, "reward_std": 0.8486683964729309, "rewards/fixed_code_pass_all_test_reward/mean": 0.8541666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.3500283360481262, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1041666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19795581698417664, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 170.125, "completions/mean_terminated_length": 170.125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.1332483682781458, "epoch": 0.07671418873048201, "frac_reward_zero_std": 0.0, "grad_norm": 5.59375, "learning_rate": 1.5135135135135138e-05, "loss": 0.0, "num_tokens": 774076.0, "reward": 1.4375, "reward_std": 1.1160356998443604, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.1638177689164877, "epoch": 0.07739307535641547, "frac_reward_zero_std": 0.0, "grad_norm": 3.875, "learning_rate": 1.527027027027027e-05, "loss": -0.0, "num_tokens": 778198.0, "reward": 2.049999952316284, "reward_std": 0.9365590214729309, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.550000011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.49856939911842346, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 121.875, "completions/mean_terminated_length": 121.875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.219361437484622, "epoch": 0.07807196198234895, "frac_reward_zero_std": 0.0, "grad_norm": 4.40625, "learning_rate": 1.540540540540541e-05, "loss": 0.0, "num_tokens": 782253.0, "reward": 1.3854167461395264, "reward_std": 0.42007532715797424, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1354166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19889327883720398, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 317.0, "completions/mean_terminated_length": 317.0, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.08035129262134433, "epoch": 0.07875084860828242, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "learning_rate": 1.554054054054054e-05, "loss": -0.0, "num_tokens": 788909.0, "reward": 2.2041666507720947, "reward_std": 0.5428284406661987, "rewards/fixed_code_pass_all_test_reward/mean": 0.6499999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.38172540068626404, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5541666746139526, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2598457932472229, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.0929252477362752, "epoch": 0.07942973523421588, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "learning_rate": 1.5675675675675676e-05, "loss": 0.0, "num_tokens": 794186.0, "reward": 1.0916666984558105, "reward_std": 0.17066630721092224, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17066630721092224, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.1819993769749999, "epoch": 0.08010862186014936, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "learning_rate": 1.581081081081081e-05, "loss": -0.0, "num_tokens": 799878.0, "reward": 1.087499976158142, "reward_std": 0.18077214062213898, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08749999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 304.5, "completions/mean_terminated_length": 304.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.058641504030674696, "epoch": 0.08078750848608282, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "learning_rate": 1.5945945945945947e-05, "loss": -0.0, "num_tokens": 806706.0, "reward": 2.5645833015441895, "reward_std": 0.34161585569381714, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5645833015441895, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.34161585569381714, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.11894779466092587, "epoch": 0.0814663951120163, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "learning_rate": 1.6081081081081083e-05, "loss": 0.0, "num_tokens": 812551.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 198.75, "completions/mean_terminated_length": 198.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07181581668555737, "epoch": 0.08214528173794976, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "learning_rate": 1.6216216216216218e-05, "loss": -0.0, "num_tokens": 817517.0, "reward": 2.3249998092651367, "reward_std": 0.30472469329833984, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.32500001788139343, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.30472469329833984, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 237.625, "completions/mean_terminated_length": 237.625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.13512822892516851, "epoch": 0.08282416836388323, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "learning_rate": 1.6351351351351354e-05, "loss": -0.0, "num_tokens": 823210.0, "reward": 1.0708333253860474, "reward_std": 0.098299041390419, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07083333283662796, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09829902648925781, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.08480434771627188, "epoch": 0.0835030549898167, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "learning_rate": 1.648648648648649e-05, "loss": 0.0, "num_tokens": 828285.0, "reward": 2.1750001907348633, "reward_std": 0.594084620475769, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.800000011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2916836738586426, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 341.0, "completions/mean_terminated_length": 341.0, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "entropy": 0.12959323124960065, "epoch": 0.08418194161575017, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.662162162162162e-05, "loss": 0.0, "num_tokens": 835541.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.06366240326315165, "epoch": 0.08486082824168364, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "learning_rate": 1.6756756756756757e-05, "loss": -0.0, "num_tokens": 841922.0, "reward": 2.2395832538604736, "reward_std": 0.1632840782403946, "rewards/fixed_code_pass_all_test_reward/mean": 0.3645833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.1886538565158844, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2314550280570984, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 440.125, "completions/mean_terminated_length": 440.125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "entropy": 0.10514781018719077, "epoch": 0.0855397148676171, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "learning_rate": 1.6891891891891896e-05, "loss": 0.0, "num_tokens": 850291.0, "reward": 1.53125, "reward_std": 0.3750595152378082, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.53125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3750595450401306, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.1252982271835208, "epoch": 0.08621860149355058, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "learning_rate": 1.7027027027027028e-05, "loss": 0.0, "num_tokens": 854765.0, "reward": 1.15625, "reward_std": 0.6935609579086304, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.06566974520683289, "epoch": 0.08689748811948404, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "learning_rate": 1.7162162162162163e-05, "loss": -0.0, "num_tokens": 861299.0, "reward": 1.5520832538604736, "reward_std": 0.24372075498104095, "rewards/fixed_code_pass_all_test_reward/mean": 0.4895833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.17501415312290192, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12400397658348083, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 543.375, "completions/mean_terminated_length": 543.375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 0.12522254325449467, "epoch": 0.08757637474541752, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "learning_rate": 1.72972972972973e-05, "loss": -0.0, "num_tokens": 871966.0, "reward": 2.1458332538604736, "reward_std": 0.3431587517261505, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.2070196568965912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3958333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3666396141052246, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.163420214317739, "epoch": 0.08825526137135098, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "learning_rate": 1.7432432432432434e-05, "loss": -0.0, "num_tokens": 877939.0, "reward": 1.9854166507720947, "reward_std": 0.4245854616165161, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3604166507720947, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.29866600036621094, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 267.0, "completions/mean_terminated_length": 267.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.024705966003239155, "epoch": 0.08893414799728445, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "learning_rate": 1.756756756756757e-05, "loss": -0.0, "num_tokens": 884483.0, "reward": 2.8291666507720947, "reward_std": 0.3675044775009155, "rewards/fixed_code_pass_all_test_reward/mean": 0.9125000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2474873811006546, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15430334210395813, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 420.25, "completions/mean_terminated_length": 420.25, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.1089058571960777, "epoch": 0.08961303462321792, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "learning_rate": 1.7702702702702702e-05, "loss": 0.0, "num_tokens": 893029.0, "reward": 2.1129465103149414, "reward_std": 0.19129861891269684, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.15152287483215332, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.48794645071029663, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11027258634567261, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 498.5, "completions/mean_terminated_length": 498.5, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.28384515829384327, "epoch": 0.09029192124915139, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "learning_rate": 1.783783783783784e-05, "loss": -0.0, "num_tokens": 900705.0, "reward": 0.8999999761581421, "reward_std": 0.37032803893089294, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.04758477327413857, "epoch": 0.09097080787508487, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "learning_rate": 1.7972972972972976e-05, "loss": 0.0, "num_tokens": 907077.0, "reward": 2.3958334922790527, "reward_std": 1.0155048370361328, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6458333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4027435779571533, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 259.875, "completions/mean_terminated_length": 259.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.04968078434467316, "epoch": 0.09164969450101833, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "learning_rate": 1.8108108108108108e-05, "loss": -0.0, "num_tokens": 914364.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.15637117624282837, "epoch": 0.0923285811269518, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "learning_rate": 1.8243243243243244e-05, "loss": 0.0, "num_tokens": 918713.0, "reward": 1.4208333492279053, "reward_std": 0.6153944730758667, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17083333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2675209641456604, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 134.75, "completions/mean_terminated_length": 134.75, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.12832927331328392, "epoch": 0.09300746775288526, "frac_reward_zero_std": 0.0, "grad_norm": 3.671875, "learning_rate": 1.8378378378378383e-05, "loss": -0.0, "num_tokens": 923279.0, "reward": 2.1472220420837402, "reward_std": 0.627296507358551, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6472222208976746, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.38607826828956604, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06418356252834201, "epoch": 0.09368635437881874, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8513513513513515e-05, "loss": 0.0, "num_tokens": 928250.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 105.375, "completions/mean_terminated_length": 105.375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.15013590827584267, "epoch": 0.0943652410047522, "frac_reward_zero_std": 0.0, "grad_norm": 5.28125, "learning_rate": 1.864864864864865e-05, "loss": -0.0, "num_tokens": 932285.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.08057058416306973, "epoch": 0.09504412763068568, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "learning_rate": 1.8783783783783786e-05, "loss": 0.0, "num_tokens": 937035.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 349.25, "completions/mean_terminated_length": 106.5714340209961, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.07423369376920164, "epoch": 0.09572301425661914, "frac_reward_zero_std": 0.0, "grad_norm": 5.84375, "learning_rate": 1.891891891891892e-05, "loss": 0.0, "num_tokens": 943061.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 138.375, "completions/mean_terminated_length": 138.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.062284584157168865, "epoch": 0.09640190088255261, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "learning_rate": 1.9054054054054057e-05, "loss": -0.0, "num_tokens": 948064.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 100.75, "completions/mean_terminated_length": 100.75, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.07618905929848552, "epoch": 0.09708078750848609, "frac_reward_zero_std": 0.0, "grad_norm": 3.6875, "learning_rate": 1.918918918918919e-05, "loss": -0.0, "num_tokens": 952110.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07020694715902209, "epoch": 0.09775967413441955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9324324324324328e-05, "loss": 0.0, "num_tokens": 958381.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 88.25, "completions/mean_terminated_length": 88.25, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.07306768978014588, "epoch": 0.09843856076035302, "frac_reward_zero_std": 0.0, "grad_norm": 5.375, "learning_rate": 1.9459459459459463e-05, "loss": 0.0, "num_tokens": 962151.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06474465597420931, "epoch": 0.09911744738628649, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "learning_rate": 1.9594594594594595e-05, "loss": -0.0, "num_tokens": 966740.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 67.5, "completions/mean_terminated_length": 67.5, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.06335726520046592, "epoch": 0.09979633401221996, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.972972972972973e-05, "loss": 0.0, "num_tokens": 970304.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 85.5, "completions/mean_terminated_length": 85.5, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.14347989484667778, "epoch": 0.10047522063815342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9864864864864866e-05, "loss": 0.0, "num_tokens": 974108.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 173.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.062342855613678694, "epoch": 0.1011541072640869, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "learning_rate": 2e-05, "loss": 0.0, "num_tokens": 979442.0, "reward": 2.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.03864629892632365, "epoch": 0.10183299389002037, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "learning_rate": 1.999997189149227e-05, "loss": 0.0, "num_tokens": 987228.0, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.03964217263273895, "epoch": 0.10251188051595383, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "learning_rate": 1.999988756612709e-05, "loss": 0.0, "num_tokens": 995402.0, "reward": 2.15625, "reward_std": 0.35197147727012634, "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, "rewards/fixed_code_pass_all_test_reward/std": 0.35197150707244873, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.0510701450984925, "epoch": 0.10319076714188731, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "learning_rate": 1.9999747024378516e-05, "loss": -0.0, "num_tokens": 1000416.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 165.375, "completions/mean_terminated_length": 165.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.0711862719617784, "epoch": 0.10386965376782077, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "learning_rate": 1.9999550267036634e-05, "loss": 0.0, "num_tokens": 1005595.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 114.625, "completions/mean_terminated_length": 114.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06987747130915523, "epoch": 0.10454854039375425, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.999929729520755e-05, "loss": 0.0, "num_tokens": 1009872.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 89.625, "completions/mean_terminated_length": 89.625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.03780437004752457, "epoch": 0.10522742701968771, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "learning_rate": 1.99989881103134e-05, "loss": 0.0, "num_tokens": 1013853.0, "reward": 2.9583334922790527, "reward_std": 0.11785107105970383, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.006107321009039879, "epoch": 0.10590631364562118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9998622714092328e-05, "loss": 0.0, "num_tokens": 1020293.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 69.625, "completions/mean_terminated_length": 69.625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.07531640492379665, "epoch": 0.10658520027155464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9998201108598477e-05, "loss": 0.0, "num_tokens": 1023946.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 206.0, "completions/mean_terminated_length": 206.0, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.03529926063492894, "epoch": 0.10726408689748812, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "learning_rate": 1.9997723296201997e-05, "loss": -0.0, "num_tokens": 1029738.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 294.75, "completions/mean_terminated_length": 294.75, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.027123197447508574, "epoch": 0.1079429735234216, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "learning_rate": 1.9997189279589003e-05, "loss": 0.0, "num_tokens": 1037408.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.042470349464565516, "epoch": 0.10862186014935506, "frac_reward_zero_std": 0.0, "grad_norm": 3.78125, "learning_rate": 1.9996599061761575e-05, "loss": 0.0, "num_tokens": 1042280.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 102.125, "completions/mean_terminated_length": 102.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.04984563961625099, "epoch": 0.10930074677528853, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "learning_rate": 1.9995952646037743e-05, "loss": 0.0, "num_tokens": 1046441.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 103.25, "completions/mean_terminated_length": 103.25, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.06849029986187816, "epoch": 0.109979633401222, "frac_reward_zero_std": 0.0, "grad_norm": 3.5625, "learning_rate": 1.9995250036051462e-05, "loss": 0.0, "num_tokens": 1050675.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 191.375, "completions/mean_terminated_length": 191.375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.024978973204270005, "epoch": 0.11065852002715547, "frac_reward_zero_std": 0.0, "grad_norm": 3.078125, "learning_rate": 1.9994491235752595e-05, "loss": -0.0, "num_tokens": 1056510.0, "reward": 2.8214285373687744, "reward_std": 0.3306500315666199, "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.33065006136894226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 109.875, "completions/mean_terminated_length": 109.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.03540743584744632, "epoch": 0.11133740665308893, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9993676249406895e-05, "loss": 0.0, "num_tokens": 1060901.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 101.0, "completions/mean_terminated_length": 101.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.07029630057513714, "epoch": 0.1120162932790224, "frac_reward_zero_std": 0.0, "grad_norm": 5.875, "learning_rate": 1.999280508159597e-05, "loss": 0.0, "num_tokens": 1065045.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 114.25, "completions/mean_terminated_length": 114.25, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.04848286882042885, "epoch": 0.11269517990495587, "frac_reward_zero_std": 0.0, "grad_norm": 3.375, "learning_rate": 1.999187773721726e-05, "loss": 0.0, "num_tokens": 1069423.0, "reward": 2.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 73.5, "completions/mean_terminated_length": 73.5, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.0937038529664278, "epoch": 0.11337406653088934, "frac_reward_zero_std": 0.0, "grad_norm": 4.09375, "learning_rate": 1.9990894221484027e-05, "loss": -0.0, "num_tokens": 1073107.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.04907804913818836, "epoch": 0.11405295315682282, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "learning_rate": 1.9989854539925296e-05, "loss": 0.0, "num_tokens": 1077942.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.01483443018514663, "epoch": 0.11473183978275628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9988758698385854e-05, "loss": 0.0, "num_tokens": 1084267.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 109.25, "completions/mean_terminated_length": 109.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.06854783603921533, "epoch": 0.11541072640868975, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "learning_rate": 1.9987606703026187e-05, "loss": 0.0, "num_tokens": 1088565.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 92.625, "completions/mean_terminated_length": 92.625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.03099416079930961, "epoch": 0.11608961303462322, "frac_reward_zero_std": 0.0, "grad_norm": 3.78125, "learning_rate": 1.9986398560322476e-05, "loss": -0.0, "num_tokens": 1092578.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 101.125, "completions/mean_terminated_length": 101.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.02786101191304624, "epoch": 0.11676849966055669, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9985134277066533e-05, "loss": 0.0, "num_tokens": 1096779.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 189.125, "completions/mean_terminated_length": 189.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.02895908593200147, "epoch": 0.11744738628649015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.998381386036578e-05, "loss": 0.0, "num_tokens": 1102372.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 98.0, "completions/mean_terminated_length": 98.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.05383358057588339, "epoch": 0.11812627291242363, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "learning_rate": 1.9982437317643218e-05, "loss": -0.0, "num_tokens": 1106292.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.02336695638950914, "epoch": 0.11880515953835709, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "learning_rate": 1.9981004656637344e-05, "loss": -0.0, "num_tokens": 1113063.0, "reward": 1.5138888359069824, "reward_std": 0.2444263994693756, "rewards/fixed_code_pass_all_test_reward/mean": 0.5138888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.2444264143705368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.02944644633680582, "epoch": 0.11948404616429056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9979515885402156e-05, "loss": 0.0, "num_tokens": 1118135.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 113.25, "completions/mean_terminated_length": 113.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.0337059882003814, "epoch": 0.12016293279022404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9977971012307085e-05, "loss": 0.0, "num_tokens": 1122561.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 188.0, "completions/mean_terminated_length": 188.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.01251135824713856, "epoch": 0.1208418194161575, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "learning_rate": 1.9976370046036947e-05, "loss": -0.0, "num_tokens": 1128121.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.060274966061115265, "epoch": 0.12152070604209098, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "learning_rate": 1.9974712995591887e-05, "loss": -0.0, "num_tokens": 1132759.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 302.625, "completions/mean_terminated_length": 302.625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.026927365455776453, "epoch": 0.12219959266802444, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "learning_rate": 1.9972999870287357e-05, "loss": 0.0, "num_tokens": 1140436.0, "reward": 2.7395834922790527, "reward_std": 0.5408648252487183, "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.25173014402389526, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.011356945033185184, "epoch": 0.12287847929395791, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "learning_rate": 1.997123067975404e-05, "loss": -0.0, "num_tokens": 1145509.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.03719848836772144, "epoch": 0.12355736591989137, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.996940543393778e-05, "loss": 0.0, "num_tokens": 1149813.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 72.375, "completions/mean_terminated_length": 72.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.028450862504541874, "epoch": 0.12423625254582485, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9967524143099583e-05, "loss": 0.0, "num_tokens": 1153400.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.04874406987801194, "epoch": 0.12491513917175831, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9965586817815494e-05, "loss": 0.0, "num_tokens": 1158354.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.06884664436802268, "epoch": 0.12559402579769177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9963593468976583e-05, "loss": 0.0, "num_tokens": 1161829.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 111.75, "completions/mean_terminated_length": 111.75, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.06350117921829224, "epoch": 0.12627291242362526, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "learning_rate": 1.9961544107788855e-05, "loss": 0.0, "num_tokens": 1165915.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 85.375, "completions/mean_terminated_length": 85.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.11480604950338602, "epoch": 0.12695179904955872, "frac_reward_zero_std": 0.0, "grad_norm": 4.59375, "learning_rate": 1.9959438745773216e-05, "loss": -0.0, "num_tokens": 1170070.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 115.375, "completions/mean_terminated_length": 115.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.04991080705076456, "epoch": 0.12763068567549218, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9957277394765377e-05, "loss": 0.0, "num_tokens": 1174785.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 112.25, "completions/mean_terminated_length": 112.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.023421406280249357, "epoch": 0.12830957230142567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.995506006691581e-05, "loss": 0.0, "num_tokens": 1179555.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 78.625, "completions/mean_terminated_length": 78.625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.0779186524450779, "epoch": 0.12898845892735913, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9952786774689667e-05, "loss": 0.0, "num_tokens": 1183624.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.024235277203842998, "epoch": 0.1296673455532926, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "learning_rate": 1.9950457530866726e-05, "loss": -0.0, "num_tokens": 1188030.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 116.25, "completions/mean_terminated_length": 116.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.027389248367398977, "epoch": 0.13034623217922606, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "learning_rate": 1.9948072348541294e-05, "loss": -0.0, "num_tokens": 1192208.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 86.375, "completions/mean_terminated_length": 86.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.031602269038558006, "epoch": 0.13102511880515955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9945631241122158e-05, "loss": 0.0, "num_tokens": 1196147.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 76.125, "completions/mean_terminated_length": 76.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.08630556054413319, "epoch": 0.131704005431093, "frac_reward_zero_std": 0.0, "grad_norm": 4.46875, "learning_rate": 1.9943134222332493e-05, "loss": -0.0, "num_tokens": 1199948.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 217.75, "completions/mean_terminated_length": 217.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07903834339231253, "epoch": 0.13238289205702647, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "learning_rate": 1.994058130620979e-05, "loss": 0.0, "num_tokens": 1205962.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 82.375, "completions/mean_terminated_length": 82.375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.06530685583129525, "epoch": 0.13306177868295996, "frac_reward_zero_std": 0.0, "grad_norm": 4.03125, "learning_rate": 1.9937972507105793e-05, "loss": 0.0, "num_tokens": 1209837.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 103.125, "completions/mean_terminated_length": 103.125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.046963199973106384, "epoch": 0.13374066530889342, "frac_reward_zero_std": 0.0, "grad_norm": 4.6875, "learning_rate": 1.993530783968638e-05, "loss": 0.0, "num_tokens": 1213982.0, "reward": 2.375, "reward_std": 1.0606601238250732, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 93.5, "completions/mean_terminated_length": 93.5, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.09216509386897087, "epoch": 0.13441955193482688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.993258731893152e-05, "loss": 0.0, "num_tokens": 1217970.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.010635877726599574, "epoch": 0.13509843856076034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.992981096013517e-05, "loss": 0.0, "num_tokens": 1223086.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 91.75, "completions/mean_terminated_length": 91.75, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.08575577288866043, "epoch": 0.13577732518669383, "frac_reward_zero_std": 0.0, "grad_norm": 5.125, "learning_rate": 1.9926978778905193e-05, "loss": 0.0, "num_tokens": 1226980.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 86.75, "completions/mean_terminated_length": 86.75, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.07871610764414072, "epoch": 0.1364562118126273, "frac_reward_zero_std": 0.0, "grad_norm": 6.34375, "learning_rate": 1.992409079116326e-05, "loss": 0.0, "num_tokens": 1230898.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 69.5, "completions/mean_terminated_length": 69.5, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.06334522133693099, "epoch": 0.13713509843856075, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "learning_rate": 1.9921147013144782e-05, "loss": 0.0, "num_tokens": 1234566.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.034989220555871725, "epoch": 0.13781398506449424, "frac_reward_zero_std": 0.0, "grad_norm": 3.5, "learning_rate": 1.9918147461398796e-05, "loss": -0.0, "num_tokens": 1240086.0, "reward": 1.9166666269302368, "reward_std": 0.5657789707183838, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.015031340532004833, "epoch": 0.1384928716904277, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9915092152787888e-05, "loss": 0.0, "num_tokens": 1246264.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 73.125, "completions/mean_terminated_length": 73.125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.05455721355974674, "epoch": 0.13917175831636117, "frac_reward_zero_std": 0.0, "grad_norm": 7.6875, "learning_rate": 1.991198110448809e-05, "loss": -0.0, "num_tokens": 1249881.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 106.5, "completions/mean_terminated_length": 106.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.038839657325297594, "epoch": 0.13985064494229463, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "learning_rate": 1.9908814333988794e-05, "loss": 0.0, "num_tokens": 1253877.0, "reward": 2.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 91.25, "completions/mean_terminated_length": 91.25, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.03963353531435132, "epoch": 0.14052953156822812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.990559185909263e-05, "loss": 0.0, "num_tokens": 1257879.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 68.25, "completions/mean_terminated_length": 68.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.026947764679789543, "epoch": 0.14120841819416158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9902313697915395e-05, "loss": 0.0, "num_tokens": 1261641.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.10003570653498173, "epoch": 0.14188730482009504, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "learning_rate": 1.9898979868885933e-05, "loss": -0.0, "num_tokens": 1265912.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.031031151534989476, "epoch": 0.1425661914460285, "frac_reward_zero_std": 0.0, "grad_norm": 3.578125, "learning_rate": 1.989559039074603e-05, "loss": 0.0, "num_tokens": 1271198.0, "reward": 2.921875, "reward_std": 0.22097086906433105, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 300.875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "entropy": 0.013214790145866573, "epoch": 0.143245078071962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.989214528255033e-05, "loss": 0.0, "num_tokens": 1278461.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 65.75, "completions/mean_terminated_length": 65.75, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.03855268331244588, "epoch": 0.14392396469789545, "frac_reward_zero_std": 0.0, "grad_norm": 13.6875, "learning_rate": 1.9888644563666194e-05, "loss": 0.0, "num_tokens": 1282075.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.042058383114635944, "epoch": 0.1446028513238289, "frac_reward_zero_std": 0.0, "grad_norm": 3.578125, "learning_rate": 1.9885088253773623e-05, "loss": 0.0, "num_tokens": 1287535.0, "reward": 2.0500001907348633, "reward_std": 0.5928140878677368, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 181.625, "completions/mean_terminated_length": 181.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.03320292220450938, "epoch": 0.1452817379497624, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "learning_rate": 1.988147637286513e-05, "loss": 0.0, "num_tokens": 1293028.0, "reward": 1.5416667461395264, "reward_std": 0.3959116041660309, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.0128429364413023, "epoch": 0.14596062457569586, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.987780894124563e-05, "loss": 0.0, "num_tokens": 1300641.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 416.625, "completions/mean_terminated_length": 416.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.03136526886373758, "epoch": 0.14663951120162932, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "learning_rate": 1.987408597953233e-05, "loss": 0.0, "num_tokens": 1310110.0, "reward": 2.8375000953674316, "reward_std": 0.2199837565422058, "rewards/fixed_code_pass_all_test_reward/mean": 0.8374999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.219983771443367, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 57.125, "completions/mean_terminated_length": 57.125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.02553011034615338, "epoch": 0.14731839782756279, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.987030750865461e-05, "loss": 0.0, "num_tokens": 1313607.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 212.25, "completions/mean_terminated_length": 212.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.04503213532734662, "epoch": 0.14799728445349628, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "learning_rate": 1.9866473549853904e-05, "loss": -0.0, "num_tokens": 1321249.0, "reward": 1.6964285373687744, "reward_std": 0.24669833481311798, "rewards/fixed_code_pass_all_test_reward/mean": 0.6964285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.2466983050107956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 105.75, "completions/mean_terminated_length": 105.75, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.022715769009664655, "epoch": 0.14867617107942974, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9862584124683587e-05, "loss": 0.0, "num_tokens": 1325439.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 311.625, "completions/mean_terminated_length": 311.625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.02056844183243811, "epoch": 0.1493550577053632, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "learning_rate": 1.9858639255008844e-05, "loss": -0.0, "num_tokens": 1332892.0, "reward": 2.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 294.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.03644117899239063, "epoch": 0.1500339443312967, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "learning_rate": 1.9854638963006552e-05, "loss": 0.0, "num_tokens": 1340240.0, "reward": 1.09375, "reward_std": 1.060133934020996, "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.02476617321372032, "epoch": 0.15071283095723015, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "learning_rate": 1.9850583271165166e-05, "loss": -0.0, "num_tokens": 1346332.0, "reward": 2.642857074737549, "reward_std": 0.38180169463157654, "rewards/fixed_code_pass_all_test_reward/mean": 0.6428571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.3818017840385437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 58.625, "completions/mean_terminated_length": 58.625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.08807905670255423, "epoch": 0.1513917175831636, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "learning_rate": 1.9846472202284574e-05, "loss": -0.0, "num_tokens": 1350065.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 103.875, "completions/mean_terminated_length": 103.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.028961456147953868, "epoch": 0.15207060420909707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.984230577947597e-05, "loss": 0.0, "num_tokens": 1354232.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.03760439460165799, "epoch": 0.15274949083503056, "frac_reward_zero_std": 0.0, "grad_norm": 5.90625, "learning_rate": 1.9838084026161746e-05, "loss": 0.0, "num_tokens": 1358590.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.053790890611708164, "epoch": 0.15342837746096402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9833806966075343e-05, "loss": 0.0, "num_tokens": 1363497.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 94.5, "completions/mean_terminated_length": 94.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.0835356218740344, "epoch": 0.15410726408689748, "frac_reward_zero_std": 0.0, "grad_norm": 3.84375, "learning_rate": 1.9829474623261106e-05, "loss": 0.0, "num_tokens": 1367525.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 81.5, "completions/mean_terminated_length": 81.5, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.0343251540325582, "epoch": 0.15478615071283094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9825087022074182e-05, "loss": 0.0, "num_tokens": 1371329.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 129.5, "completions/mean_terminated_length": 129.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.026921316282823682, "epoch": 0.15546503733876443, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "learning_rate": 1.9820644187180354e-05, "loss": 0.0, "num_tokens": 1375861.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 89.875, "completions/mean_terminated_length": 89.875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.02502172254025936, "epoch": 0.1561439239646979, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.981614614355591e-05, "loss": 0.0, "num_tokens": 1379844.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 81.0, "completions/mean_terminated_length": 81.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.033004665514454246, "epoch": 0.15682281059063136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.981159291648751e-05, "loss": 0.0, "num_tokens": 1383700.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.029004631796851754, "epoch": 0.15750169721656485, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9806984531572038e-05, "loss": 0.0, "num_tokens": 1388536.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 89.75, "completions/mean_terminated_length": 89.75, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.04491503653116524, "epoch": 0.1581805838424983, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "learning_rate": 1.9802321014716465e-05, "loss": -0.0, "num_tokens": 1392558.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 90.125, "completions/mean_terminated_length": 90.125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.050404710695147514, "epoch": 0.15885947046843177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9797602392137678e-05, "loss": 0.0, "num_tokens": 1396623.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "entropy": 0.0162679695058614, "epoch": 0.15953835709436523, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "learning_rate": 1.9792828690362377e-05, "loss": 0.0, "num_tokens": 1404604.0, "reward": 1.9659091234207153, "reward_std": 0.09642363339662552, "rewards/fixed_code_pass_all_test_reward/mean": 0.9659091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.09642364084720612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.021264452021569014, "epoch": 0.16021724372029872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9787999936226877e-05, "loss": 0.0, "num_tokens": 1409600.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 339.75, "completions/mean_terminated_length": 339.75, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "entropy": 0.012292444240301847, "epoch": 0.16089613034623218, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "learning_rate": 1.9783116156877008e-05, "loss": 0.0, "num_tokens": 1417518.0, "reward": 1.9772727489471436, "reward_std": 0.042082689702510834, "rewards/fixed_code_pass_all_test_reward/mean": 0.9772727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.04208271950483322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 83.0, "completions/mean_terminated_length": 83.0, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.031746637308970094, "epoch": 0.16157501697216564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9778177379767903e-05, "loss": 0.0, "num_tokens": 1421246.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.05210646474733949, "epoch": 0.16225390359809913, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "learning_rate": 1.9773183632663907e-05, "loss": -0.0, "num_tokens": 1426286.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 103.25, "completions/mean_terminated_length": 103.25, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.020215536933392286, "epoch": 0.1629327902240326, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9768134943638367e-05, "loss": 0.0, "num_tokens": 1430328.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.0342337426263839, "epoch": 0.16361167684996605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9763031341073512e-05, "loss": 0.0, "num_tokens": 1434560.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 225.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.06160185765475035, "epoch": 0.16429056347589951, "frac_reward_zero_std": 0.0, "grad_norm": 4.125, "learning_rate": 1.9757872853660265e-05, "loss": -0.0, "num_tokens": 1440885.0, "reward": 2.21875, "reward_std": 0.6999680995941162, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.3764851689338684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 91.25, "completions/mean_terminated_length": 91.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.04878190439194441, "epoch": 0.164969450101833, "frac_reward_zero_std": 0.0, "grad_norm": 3.609375, "learning_rate": 1.975265951039811e-05, "loss": -0.0, "num_tokens": 1444679.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 253.0, "completions/mean_terminated_length": 253.0, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.03578205150552094, "epoch": 0.16564833672776647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.97473913405949e-05, "loss": 0.0, "num_tokens": 1451223.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 89.5, "completions/mean_terminated_length": 89.5, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.05918336706236005, "epoch": 0.16632722335369993, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "learning_rate": 1.974206837386672e-05, "loss": -0.0, "num_tokens": 1455203.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 75.125, "completions/mean_terminated_length": 75.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.02153742383234203, "epoch": 0.1670061099796334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9736690640137696e-05, "loss": 0.0, "num_tokens": 1459156.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 115.5, "completions/mean_terminated_length": 115.5, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.06047332426533103, "epoch": 0.16768499660556688, "frac_reward_zero_std": 0.0, "grad_norm": 3.640625, "learning_rate": 1.9731258169639846e-05, "loss": -0.0, "num_tokens": 1463312.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 218.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.013830311829224229, "epoch": 0.16836388323150034, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "learning_rate": 1.9725770992912893e-05, "loss": 0.0, "num_tokens": 1469461.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.030970022082328796, "epoch": 0.1690427698574338, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.972022914080411e-05, "loss": 0.0, "num_tokens": 1473672.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 219.625, "completions/mean_terminated_length": 219.625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.024551305454224348, "epoch": 0.1697216564833673, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "learning_rate": 1.9714632644468135e-05, "loss": 0.0, "num_tokens": 1479845.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 98.875, "completions/mean_terminated_length": 98.875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.045385925099253654, "epoch": 0.17040054310930075, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "learning_rate": 1.9708981535366797e-05, "loss": -0.0, "num_tokens": 1483668.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.047814551275223494, "epoch": 0.1710794297352342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.970327584526895e-05, "loss": 0.0, "num_tokens": 1490566.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.058026759419590235, "epoch": 0.17175831636116767, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9697515606250276e-05, "loss": 0.0, "num_tokens": 1495028.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 122.0, "completions/mean_terminated_length": 122.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06939646881073713, "epoch": 0.17243720298710116, "frac_reward_zero_std": 0.0, "grad_norm": 7.15625, "learning_rate": 1.9691700850693126e-05, "loss": -0.0, "num_tokens": 1499180.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 83.75, "completions/mean_terminated_length": 83.75, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.037486160174012184, "epoch": 0.17311608961303462, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9685831611286312e-05, "loss": 0.0, "num_tokens": 1502938.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 85.5, "completions/mean_terminated_length": 85.5, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.08376667788252234, "epoch": 0.17379497623896809, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.967990792102495e-05, "loss": 0.0, "num_tokens": 1506678.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.03614409361034632, "epoch": 0.17447386286490157, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9673929813210265e-05, "loss": 0.0, "num_tokens": 1512223.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 333.0, "completions/mean_terminated_length": 333.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.038370306603610516, "epoch": 0.17515274949083504, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "learning_rate": 1.9667897321449387e-05, "loss": -0.0, "num_tokens": 1520255.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 79.75, "completions/mean_terminated_length": 79.75, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.05382959032431245, "epoch": 0.1758316361167685, "frac_reward_zero_std": 0.0, "grad_norm": 7.5625, "learning_rate": 1.9661810479655184e-05, "loss": 0.0, "num_tokens": 1524165.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 128.375, "completions/mean_terminated_length": 128.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.035490254405885935, "epoch": 0.17651052274270196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9655669322046068e-05, "loss": 0.0, "num_tokens": 1528880.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 79.25, "completions/mean_terminated_length": 79.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.08525615930557251, "epoch": 0.17718940936863545, "frac_reward_zero_std": 0.0, "grad_norm": 6.28125, "learning_rate": 1.9649473883145792e-05, "loss": -0.0, "num_tokens": 1532602.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 90.5, "completions/mean_terminated_length": 90.5, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.0366119546815753, "epoch": 0.1778682959945689, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9643224197783265e-05, "loss": 0.0, "num_tokens": 1536630.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 100.0, "completions/mean_terminated_length": 100.0, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.01925523462705314, "epoch": 0.17854718262050237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9636920301092352e-05, "loss": 0.0, "num_tokens": 1540558.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 535.375, "completions/mean_terminated_length": 535.375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "entropy": 0.04267650982365012, "epoch": 0.17922606924643583, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "learning_rate": 1.9630562228511682e-05, "loss": -0.0, "num_tokens": 1551401.0, "reward": 2.1964285373687744, "reward_std": 0.5175492763519287, "rewards/fixed_code_pass_all_test_reward/mean": 0.6964285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.2901442348957062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 438.375, "completions/mean_terminated_length": 438.375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.0744494921527803, "epoch": 0.17990495587236932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.962415001578444e-05, "loss": 0.0, "num_tokens": 1561252.0, "reward": 2.5999999046325684, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6000000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.072756327688694, "epoch": 0.18058384249830278, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9617683698958168e-05, "loss": 0.0, "num_tokens": 1565348.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.018114945152774453, "epoch": 0.18126272912423624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9611163314384574e-05, "loss": 0.0, "num_tokens": 1569550.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 108.125, "completions/mean_terminated_length": 108.125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.045144837349653244, "epoch": 0.18194161575016973, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9604588898719314e-05, "loss": 0.0, "num_tokens": 1573519.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 78.125, "completions/mean_terminated_length": 78.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.09496352914720774, "epoch": 0.1826205023761032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9597960488921785e-05, "loss": 0.0, "num_tokens": 1577544.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 100.75, "completions/mean_terminated_length": 100.75, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.09399104584008455, "epoch": 0.18329938900203666, "frac_reward_zero_std": 0.0, "grad_norm": 5.875, "learning_rate": 1.9591278122254938e-05, "loss": 0.0, "num_tokens": 1581494.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 91.25, "completions/mean_terminated_length": 91.25, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.05996898515149951, "epoch": 0.18397827562797012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9584541836285035e-05, "loss": 0.0, "num_tokens": 1585352.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 315.5, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.03379065846092999, "epoch": 0.1846571622539036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.957775166888147e-05, "loss": 0.0, "num_tokens": 1592692.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 82.25, "completions/mean_terminated_length": 82.25, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.07197279017418623, "epoch": 0.18533604887983707, "frac_reward_zero_std": 0.0, "grad_norm": 5.25, "learning_rate": 1.957090765821654e-05, "loss": -0.0, "num_tokens": 1596366.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 209.875, "completions/mean_terminated_length": 209.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.044796328293159604, "epoch": 0.18601493550577053, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "learning_rate": 1.9564009842765225e-05, "loss": 0.0, "num_tokens": 1601597.0, "reward": 1.0, "reward_std": 1.0690449476242065, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 343.0, "completions/mean_terminated_length": 343.0, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.036600218852981925, "epoch": 0.18669382213170402, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "learning_rate": 1.955705826130499e-05, "loss": -0.0, "num_tokens": 1609925.0, "reward": 1.740384578704834, "reward_std": 0.31565922498703003, "rewards/fixed_code_pass_all_test_reward/mean": 0.7403846383094788, "rewards/fixed_code_pass_all_test_reward/std": 0.3156592547893524, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 136.375, "completions/mean_terminated_length": 136.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.02586209448054433, "epoch": 0.18737270875763748, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "learning_rate": 1.9550052952915545e-05, "loss": 0.0, "num_tokens": 1614544.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.03599099209532142, "epoch": 0.18805159538357094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9542993956978647e-05, "loss": 0.0, "num_tokens": 1619904.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 92.0, "completions/mean_terminated_length": 92.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.055113062262535095, "epoch": 0.1887304820095044, "frac_reward_zero_std": 0.0, "grad_norm": 4.71875, "learning_rate": 1.9535881313177864e-05, "loss": -0.0, "num_tokens": 1623712.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 89.625, "completions/mean_terminated_length": 89.625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.03858232032507658, "epoch": 0.1894093686354379, "frac_reward_zero_std": 0.0, "grad_norm": 4.28125, "learning_rate": 1.9528715061498355e-05, "loss": 0.0, "num_tokens": 1627517.0, "reward": 1.625, "reward_std": 0.9161254167556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 454.0, "completions/mean_terminated_length": 454.0, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.028234433149918914, "epoch": 0.19008825526137135, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "learning_rate": 1.9521495242226648e-05, "loss": -0.0, "num_tokens": 1637149.0, "reward": 1.9375, "reward_std": 0.45650067925453186, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 101.875, "completions/mean_terminated_length": 101.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.03720711078494787, "epoch": 0.19076714188730481, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "learning_rate": 1.9514221895950416e-05, "loss": 0.0, "num_tokens": 1641228.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.05074124364182353, "epoch": 0.19144602851323828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.950689506355824e-05, "loss": 0.0, "num_tokens": 1645468.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 116.5, "completions/mean_terminated_length": 116.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.01687573315575719, "epoch": 0.19212491513917176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.949951478623938e-05, "loss": 0.0, "num_tokens": 1649664.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.05306277936324477, "epoch": 0.19280380176510523, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "learning_rate": 1.949208110548356e-05, "loss": 0.0, "num_tokens": 1654351.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 138.625, "completions/mean_terminated_length": 138.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.08211056794971228, "epoch": 0.1934826883910387, "frac_reward_zero_std": 0.0, "grad_norm": 3.375, "learning_rate": 1.948459406308071e-05, "loss": -0.0, "num_tokens": 1658692.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 187.625, "completions/mean_terminated_length": 187.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.12029063701629639, "epoch": 0.19416157501697218, "frac_reward_zero_std": 0.0, "grad_norm": 3.875, "learning_rate": 1.9477053701120746e-05, "loss": 0.0, "num_tokens": 1663641.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 200.75, "completions/mean_terminated_length": 200.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.057263683527708054, "epoch": 0.19484046164290564, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "learning_rate": 1.9469460061993336e-05, "loss": 0.0, "num_tokens": 1669239.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.07944484893232584, "epoch": 0.1955193482688391, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9461813188387652e-05, "loss": 0.0, "num_tokens": 1673623.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 269.125, "completions/mean_terminated_length": 269.125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.043128138640895486, "epoch": 0.19619823489477256, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "learning_rate": 1.9454113123292133e-05, "loss": 0.0, "num_tokens": 1679800.0, "reward": 2.3541667461395264, "reward_std": 0.2077372819185257, "rewards/fixed_code_pass_all_test_reward/mean": 0.3541666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2077372521162033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 153.75, "completions/mean_terminated_length": 153.75, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.08693261258304119, "epoch": 0.19687712152070605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9446359909994253e-05, "loss": 0.0, "num_tokens": 1684462.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 312.125, "completions/mean_terminated_length": 312.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.03681220579892397, "epoch": 0.1975560081466395, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "learning_rate": 1.9438553592080257e-05, "loss": 0.0, "num_tokens": 1692183.0, "reward": 1.9659091234207153, "reward_std": 0.09642363339662552, "rewards/fixed_code_pass_all_test_reward/mean": 0.9659091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.09642364084720612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 204.25, "completions/mean_terminated_length": 204.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.08966750046238303, "epoch": 0.19823489477257297, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9430694213434936e-05, "loss": 0.0, "num_tokens": 1697593.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 129.5, "completions/mean_terminated_length": 129.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.04149002628400922, "epoch": 0.19891378139850646, "frac_reward_zero_std": 0.0, "grad_norm": 3.921875, "learning_rate": 1.942278181824137e-05, "loss": -0.0, "num_tokens": 1702565.0, "reward": 2.1875, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.06707863416522741, "epoch": 0.19959266802443992, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "learning_rate": 1.9414816450980686e-05, "loss": 0.0, "num_tokens": 1708775.0, "reward": 2.6875, "reward_std": 0.3204349875450134, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 106.375, "completions/mean_terminated_length": 106.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.04134741902817041, "epoch": 0.20027155465037338, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.94067981564318e-05, "loss": 0.0, "num_tokens": 1713010.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 240.375, "completions/mean_terminated_length": 240.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.05806305631995201, "epoch": 0.20095044127630685, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "learning_rate": 1.9398726979671174e-05, "loss": 0.0, "num_tokens": 1719349.0, "reward": 1.671875, "reward_std": 0.2106272429227829, "rewards/fixed_code_pass_all_test_reward/mean": 0.671875, "rewards/fixed_code_pass_all_test_reward/std": 0.2106272578239441, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 108.875, "completions/mean_terminated_length": 108.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.01956355758011341, "epoch": 0.20162932790224034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9390602966072548e-05, "loss": 0.0, "num_tokens": 1723460.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.04810475930571556, "epoch": 0.2023082145281738, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "learning_rate": 1.9382426161306712e-05, "loss": 0.0, "num_tokens": 1727700.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06472568772733212, "epoch": 0.20298710115410726, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "learning_rate": 1.9374196611341212e-05, "loss": -0.0, "num_tokens": 1732251.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 126.5, "completions/mean_terminated_length": 126.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.018159526865929365, "epoch": 0.20366598778004075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9365914362440125e-05, "loss": 0.0, "num_tokens": 1736583.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 192.5, "completions/mean_terminated_length": 192.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.04344137362204492, "epoch": 0.2043448744059742, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "learning_rate": 1.9357579461163783e-05, "loss": 0.0, "num_tokens": 1741899.0, "reward": 2.3214285373687744, "reward_std": 1.0209182500839233, "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.464481920003891, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.06356910709291697, "epoch": 0.20502376103190767, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "learning_rate": 1.9349191954368515e-05, "loss": 0.0, "num_tokens": 1747426.0, "reward": 2.2291667461395264, "reward_std": 0.4448782503604889, "rewards/fixed_code_pass_all_test_reward/mean": 0.8541666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.20773723721504211, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.08265692042186856, "epoch": 0.20570264765784113, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "learning_rate": 1.9340751889206378e-05, "loss": 0.0, "num_tokens": 1753352.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 313.5, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.03501248057000339, "epoch": 0.20638153428377462, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "learning_rate": 1.93322593131249e-05, "loss": 0.0, "num_tokens": 1760788.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 128.25, "completions/mean_terminated_length": 128.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06978957680985332, "epoch": 0.20706042090970808, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "learning_rate": 1.932371427386681e-05, "loss": 0.0, "num_tokens": 1765342.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07782715698704123, "epoch": 0.20773930753564154, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "learning_rate": 1.931511681946977e-05, "loss": -0.0, "num_tokens": 1769598.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.07246410427615047, "epoch": 0.208418194161575, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "learning_rate": 1.9306466998266102e-05, "loss": 0.0, "num_tokens": 1774440.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.05745515413582325, "epoch": 0.2090970807875085, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "learning_rate": 1.9297764858882516e-05, "loss": 0.0, "num_tokens": 1781257.0, "reward": 1.5357142686843872, "reward_std": 0.10101523250341415, "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 260.75, "completions/mean_terminated_length": 260.75, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.017283402499742806, "epoch": 0.20977596741344195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9289010450239843e-05, "loss": 0.0, "num_tokens": 1787807.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 216.75, "completions/mean_terminated_length": 216.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.11775861494243145, "epoch": 0.21045485403937542, "frac_reward_zero_std": 0.0, "grad_norm": 2.953125, "learning_rate": 1.928020382155276e-05, "loss": -0.0, "num_tokens": 1793205.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.12922955304384232, "epoch": 0.2111337406653089, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9271345022329502e-05, "loss": 0.0, "num_tokens": 1799521.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 307.5, "completions/mean_terminated_length": 307.5, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.06560094049200416, "epoch": 0.21181262729124237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9262434102371596e-05, "loss": 0.0, "num_tokens": 1806829.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 223.125, "completions/mean_terminated_length": 223.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.06759773194789886, "epoch": 0.21249151391717583, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "learning_rate": 1.9253471111773572e-05, "loss": 0.0, "num_tokens": 1813294.0, "reward": 2.847222328186035, "reward_std": 0.25845497846603394, "rewards/fixed_code_pass_all_test_reward/mean": 0.8472222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.25845491886138916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.04212288232520223, "epoch": 0.2131704005431093, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.924445610092269e-05, "loss": 0.0, "num_tokens": 1817354.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 235.625, "completions/mean_terminated_length": 235.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.10384846292436123, "epoch": 0.21384928716904278, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "learning_rate": 1.9235389120498645e-05, "loss": -0.0, "num_tokens": 1822727.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 112.5, "completions/mean_terminated_length": 112.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.03845132002606988, "epoch": 0.21452817379497624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9226270221473302e-05, "loss": 0.0, "num_tokens": 1826731.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.03485893807373941, "epoch": 0.2152070604209097, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.921709945511039e-05, "loss": 0.0, "num_tokens": 1832460.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 234.5, "completions/mean_terminated_length": 234.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.1028092484921217, "epoch": 0.2158859470468432, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "learning_rate": 1.9207876872965217e-05, "loss": -0.0, "num_tokens": 1837816.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.15017211344093084, "epoch": 0.21656483367277665, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "learning_rate": 1.9198602526884388e-05, "loss": -0.0, "num_tokens": 1843215.0, "reward": 2.125, "reward_std": 0.9910312294960022, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 390.25, "completions/mean_terminated_length": 390.25, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.05205815797671676, "epoch": 0.2172437202987101, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "learning_rate": 1.9189276469005508e-05, "loss": 0.0, "num_tokens": 1852281.0, "reward": 2.012500047683716, "reward_std": 0.41209399700164795, "rewards/fixed_code_pass_all_test_reward/mean": 0.13750000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.07440238445997238, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.0689094322733581, "epoch": 0.21792260692464357, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.917989875175689e-05, "loss": 0.0, "num_tokens": 1857238.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 197.625, "completions/mean_terminated_length": 197.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.04457589378580451, "epoch": 0.21860149355057706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9170469427857264e-05, "loss": 0.0, "num_tokens": 1862115.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 228.625, "completions/mean_terminated_length": 228.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.041729350574314594, "epoch": 0.21928038017651053, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "learning_rate": 1.9160988550315475e-05, "loss": -0.0, "num_tokens": 1868216.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.023223794298246503, "epoch": 0.219959266802444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9151456172430186e-05, "loss": 0.0, "num_tokens": 1873192.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.05698779132217169, "epoch": 0.22063815342837745, "frac_reward_zero_std": 0.0, "grad_norm": 3.703125, "learning_rate": 1.914187234778958e-05, "loss": 0.0, "num_tokens": 1878709.0, "reward": 1.875, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.12405252177268267, "epoch": 0.22131704005431094, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "learning_rate": 1.913223713027106e-05, "loss": -0.0, "num_tokens": 1884617.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.027563789626583457, "epoch": 0.2219959266802444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9122550574040942e-05, "loss": 0.0, "num_tokens": 1889411.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.05370080238208175, "epoch": 0.22267481330617786, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "learning_rate": 1.9112812733554155e-05, "loss": -0.0, "num_tokens": 1894163.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 210.625, "completions/mean_terminated_length": 210.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.09333140589296818, "epoch": 0.22335369993211135, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "learning_rate": 1.910302366355393e-05, "loss": -0.0, "num_tokens": 1898952.0, "reward": 2.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 230.5, "completions/mean_terminated_length": 230.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.05855082580819726, "epoch": 0.2240325865580448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.90931834190715e-05, "loss": 0.0, "num_tokens": 1904716.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.08473899774253368, "epoch": 0.22471147318397827, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "learning_rate": 1.9083292055425783e-05, "loss": -0.0, "num_tokens": 1909610.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 316.625, "completions/mean_terminated_length": 316.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.08378634741529822, "epoch": 0.22539035980991173, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "learning_rate": 1.907334962822307e-05, "loss": 0.0, "num_tokens": 1918079.0, "reward": 2.232142925262451, "reward_std": 0.1308750957250595, "rewards/fixed_code_pass_all_test_reward/mean": 0.2321428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.13087505102157593, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 189.75, "completions/mean_terminated_length": 189.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.06544020678848028, "epoch": 0.22606924643584522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.906335619335672e-05, "loss": 0.0, "num_tokens": 1922805.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 197.5, "completions/mean_terminated_length": 197.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.08480233186855912, "epoch": 0.22674813306177868, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9053311807006845e-05, "loss": 0.0, "num_tokens": 1927617.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.0539171420969069, "epoch": 0.22742701968771215, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.904321652563998e-05, "loss": 0.0, "num_tokens": 1932305.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.08433426916599274, "epoch": 0.22810590631364563, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "learning_rate": 1.903307040600879e-05, "loss": 0.0, "num_tokens": 1937396.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.08307742513716221, "epoch": 0.2287847929395791, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "learning_rate": 1.902287350515173e-05, "loss": -0.0, "num_tokens": 1942374.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.0349551672115922, "epoch": 0.22946367956551256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9012625880392733e-05, "loss": 0.0, "num_tokens": 1946682.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.09286962915211916, "epoch": 0.23014256619144602, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "learning_rate": 1.900232758934089e-05, "loss": 0.0, "num_tokens": 1951772.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.0401443219743669, "epoch": 0.2308214528173795, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.899197868989011e-05, "loss": 0.0, "num_tokens": 1956906.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.10647087544202805, "epoch": 0.23150033944331297, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "learning_rate": 1.898157924021883e-05, "loss": -0.0, "num_tokens": 1961822.0, "reward": 2.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06849431153386831, "epoch": 0.23217922606924643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8971129298789644e-05, "loss": 0.0, "num_tokens": 1965987.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 465.0, "completions/mean_terminated_length": 465.0, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.1154327979311347, "epoch": 0.2328581126951799, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "learning_rate": 1.8960628924349006e-05, "loss": -0.0, "num_tokens": 1975067.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 216.0, "completions/mean_terminated_length": 216.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.04719087341800332, "epoch": 0.23353699932111338, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8950078175926886e-05, "loss": 0.0, "num_tokens": 1980787.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.0680823945440352, "epoch": 0.23421588594704684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8939477112836445e-05, "loss": 0.0, "num_tokens": 1985710.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.10352581646293402, "epoch": 0.2348947725729803, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "learning_rate": 1.892882579467369e-05, "loss": 0.0, "num_tokens": 1990741.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.08224901277571917, "epoch": 0.2355736591989138, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "learning_rate": 1.8918124281317162e-05, "loss": -0.0, "num_tokens": 1995244.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 405.375, "completions/mean_terminated_length": 405.375, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.1158347800374031, "epoch": 0.23625254582484725, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "learning_rate": 1.8907372632927573e-05, "loss": -0.0, "num_tokens": 2003159.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.0793031700886786, "epoch": 0.23693143245078072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8896570909947477e-05, "loss": 0.0, "num_tokens": 2007883.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.127309899777174, "epoch": 0.23761031907671418, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "learning_rate": 1.8885719173100937e-05, "loss": -0.0, "num_tokens": 2012203.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 275.25, "completions/mean_terminated_length": 275.25, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.03309164522215724, "epoch": 0.23828920570264767, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.887481748339318e-05, "loss": 0.0, "num_tokens": 2018637.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.05568704567849636, "epoch": 0.23896809232858113, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8863865902110253e-05, "loss": 0.0, "num_tokens": 2024805.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.09123982861638069, "epoch": 0.2396469789545146, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "learning_rate": 1.8852864490818678e-05, "loss": -0.0, "num_tokens": 2029361.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.08292779233306646, "epoch": 0.24032586558044808, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "learning_rate": 1.8841813311365105e-05, "loss": -0.0, "num_tokens": 2034330.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 243.375, "completions/mean_terminated_length": 243.375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.12956679053604603, "epoch": 0.24100475220638154, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "learning_rate": 1.8830712425875964e-05, "loss": -0.0, "num_tokens": 2041389.0, "reward": 2.0625, "reward_std": 0.4172614812850952, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.0687949163839221, "epoch": 0.241683638832315, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "learning_rate": 1.8819561896757124e-05, "loss": 0.0, "num_tokens": 2046137.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.082338429056108, "epoch": 0.24236252545824846, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "learning_rate": 1.8808361786693533e-05, "loss": -0.0, "num_tokens": 2051485.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.08644285053014755, "epoch": 0.24304141208418195, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "learning_rate": 1.879711215864886e-05, "loss": 0.0, "num_tokens": 2057730.0, "reward": 2.2083334922790527, "reward_std": 0.3053751587867737, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.3053751289844513, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 236.0, "completions/mean_terminated_length": 236.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.12417812552303076, "epoch": 0.2437202987101154, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "learning_rate": 1.8785813075865164e-05, "loss": 0.0, "num_tokens": 2063082.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 238.75, "completions/mean_terminated_length": 238.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.1339465482160449, "epoch": 0.24439918533604887, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "learning_rate": 1.877446460186251e-05, "loss": -0.0, "num_tokens": 2068248.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 207.125, "completions/mean_terminated_length": 207.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.04863814264535904, "epoch": 0.24507807196198234, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8763066800438638e-05, "loss": 0.0, "num_tokens": 2073585.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 235.75, "completions/mean_terminated_length": 235.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.04723559692502022, "epoch": 0.24575695858791582, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.875161973566858e-05, "loss": 0.0, "num_tokens": 2079495.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 377.75, "completions/mean_terminated_length": 377.75, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "entropy": 0.07612746069207788, "epoch": 0.24643584521384929, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "learning_rate": 1.874012347190432e-05, "loss": -0.0, "num_tokens": 2088437.0, "reward": 2.1624999046325684, "reward_std": 0.07440241426229477, "rewards/fixed_code_pass_all_test_reward/mean": 0.16249999403953552, "rewards/fixed_code_pass_all_test_reward/std": 0.07440238445997238, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.08486952725797892, "epoch": 0.24711473183978275, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "learning_rate": 1.8728578073774427e-05, "loss": 0.0, "num_tokens": 2093419.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 224.5, "completions/mean_terminated_length": 224.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.06508130906149745, "epoch": 0.24779361846571624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8716983606183673e-05, "loss": 0.0, "num_tokens": 2098511.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 367.25, "completions/mean_terminated_length": 367.25, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.06202725553885102, "epoch": 0.2484725050916497, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.87053401343127e-05, "loss": 0.0, "num_tokens": 2105721.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.07700490904971957, "epoch": 0.24915139171758316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8693647723617637e-05, "loss": 0.0, "num_tokens": 2110469.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 181.5, "completions/mean_terminated_length": 181.5, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.06152817793190479, "epoch": 0.24983027834351662, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "learning_rate": 1.8681906439829716e-05, "loss": -0.0, "num_tokens": 2115137.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 346.375, "completions/mean_terminated_length": 346.375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.041742518078535795, "epoch": 0.2505091649694501, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "learning_rate": 1.8670116348954945e-05, "loss": 0.0, "num_tokens": 2123500.0, "reward": 1.817307710647583, "reward_std": 0.12292228639125824, "rewards/fixed_code_pass_all_test_reward/mean": 0.817307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.12292228639125824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.13209902122616768, "epoch": 0.25118805159538354, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "learning_rate": 1.865827751727368e-05, "loss": -0.0, "num_tokens": 2129381.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.046899959444999695, "epoch": 0.25186693822131706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.864639001134031e-05, "loss": 0.0, "num_tokens": 2134507.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.0577500881627202, "epoch": 0.2525458248472505, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "learning_rate": 1.863445389798284e-05, "loss": 0.0, "num_tokens": 2139127.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.06450037378817797, "epoch": 0.253224711473184, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "learning_rate": 1.8622469244302542e-05, "loss": -0.0, "num_tokens": 2144848.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 197.625, "completions/mean_terminated_length": 197.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.05902037117630243, "epoch": 0.25390359809911744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8610436117673557e-05, "loss": 0.0, "num_tokens": 2149741.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 136.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.04024791670963168, "epoch": 0.2545824847250509, "frac_reward_zero_std": 0.0, "grad_norm": 4.28125, "learning_rate": 1.8598354585742537e-05, "loss": 0.0, "num_tokens": 2154013.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 231.375, "completions/mean_terminated_length": 231.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.043174607679247856, "epoch": 0.25526137135098437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.858622471642824e-05, "loss": 0.0, "num_tokens": 2159872.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.04491899721324444, "epoch": 0.25594025797691783, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8574046577921182e-05, "loss": 0.0, "num_tokens": 2164863.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 253.25, "completions/mean_terminated_length": 253.25, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.14451793301850557, "epoch": 0.25661914460285135, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "learning_rate": 1.8561820238683216e-05, "loss": 0.0, "num_tokens": 2170985.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 157.75, "completions/mean_terminated_length": 157.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.07329763192683458, "epoch": 0.2572980312287848, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "learning_rate": 1.8549545767447174e-05, "loss": -0.0, "num_tokens": 2175351.0, "reward": 2.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 119.5, "completions/mean_terminated_length": 119.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.04977906960994005, "epoch": 0.25797691785471827, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.853722323321647e-05, "loss": 0.0, "num_tokens": 2179419.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.04489452810958028, "epoch": 0.25865580448065173, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8524852705264716e-05, "loss": 0.0, "num_tokens": 2183437.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 285.875, "completions/mean_terminated_length": 285.875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.0470429384149611, "epoch": 0.2593346911065852, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "learning_rate": 1.8512434253135324e-05, "loss": 0.0, "num_tokens": 2189892.0, "reward": 2.6041667461395264, "reward_std": 0.17677675187587738, "rewards/fixed_code_pass_all_test_reward/mean": 0.6041666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 139.75, "completions/mean_terminated_length": 139.75, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.07873268146067858, "epoch": 0.26001357773251865, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8499967946641127e-05, "loss": 0.0, "num_tokens": 2194258.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 245.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.04758663475513458, "epoch": 0.2606924643584521, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "learning_rate": 1.848745385586398e-05, "loss": -0.0, "num_tokens": 2201957.0, "reward": 2.578125, "reward_std": 0.4952339828014374, "rewards/fixed_code_pass_all_test_reward/mean": 0.578125, "rewards/fixed_code_pass_all_test_reward/std": 0.4952339828014374, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.09630326088517904, "epoch": 0.26137135098438563, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "learning_rate": 1.8474892051154366e-05, "loss": 0.0, "num_tokens": 2208077.0, "reward": 2.734375, "reward_std": 0.19408094882965088, "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, "rewards/fixed_code_pass_all_test_reward/std": 0.19408094882965088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 144.5, "completions/mean_terminated_length": 144.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.08041145419701934, "epoch": 0.2620502376103191, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "learning_rate": 1.8462282603131005e-05, "loss": 0.0, "num_tokens": 2212361.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.07909099664539099, "epoch": 0.26272912423625255, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "learning_rate": 1.8449625582680445e-05, "loss": 0.0, "num_tokens": 2217505.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 141.75, "completions/mean_terminated_length": 141.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06990398140624166, "epoch": 0.263408010862186, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "learning_rate": 1.843692106095668e-05, "loss": -0.0, "num_tokens": 2221783.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 211.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.08789980597794056, "epoch": 0.2640868974881195, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "learning_rate": 1.842416910938074e-05, "loss": -0.0, "num_tokens": 2226817.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 224.75, "completions/mean_terminated_length": 224.75, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.06863742880523205, "epoch": 0.26476578411405294, "frac_reward_zero_std": 0.0, "grad_norm": 4.25, "learning_rate": 1.841136979964029e-05, "loss": 0.0, "num_tokens": 2232543.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.09061278309673071, "epoch": 0.2654446707399864, "frac_reward_zero_std": 0.0, "grad_norm": 3.40625, "learning_rate": 1.8398523203689235e-05, "loss": -0.0, "num_tokens": 2236616.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 260.375, "completions/mean_terminated_length": 260.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.11232329346239567, "epoch": 0.2661235573659199, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8385629393747292e-05, "loss": 0.0, "num_tokens": 2242499.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.09821562003344297, "epoch": 0.2668024439918534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.837268844229962e-05, "loss": 0.0, "num_tokens": 2247259.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 188.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.16587267909199, "epoch": 0.26748133061778684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8359700422096385e-05, "loss": 0.0, "num_tokens": 2251994.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.10131550021469593, "epoch": 0.2681602172437203, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "learning_rate": 1.8346665406152362e-05, "loss": 0.0, "num_tokens": 2256595.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 133.75, "completions/mean_terminated_length": 133.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.05954999150708318, "epoch": 0.26883910386965376, "frac_reward_zero_std": 0.0, "grad_norm": 4.8125, "learning_rate": 1.8333583467746515e-05, "loss": -0.0, "num_tokens": 2260897.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 243.875, "completions/mean_terminated_length": 243.875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.08692301250994205, "epoch": 0.2695179904955872, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "learning_rate": 1.83204546804216e-05, "loss": 0.0, "num_tokens": 2266864.0, "reward": 2.7083334922790527, "reward_std": 0.11785111576318741, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.11785111576318741, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 274.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.08521929755806923, "epoch": 0.2701968771215207, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "learning_rate": 1.8307279117983744e-05, "loss": 0.0, "num_tokens": 2273084.0, "reward": 2.8333334922790527, "reward_std": 0.17817412316799164, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.17817415297031403, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.06568845454603434, "epoch": 0.2708757637474542, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "learning_rate": 1.829405685450202e-05, "loss": -0.0, "num_tokens": 2277674.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 280.5, "completions/mean_terminated_length": 280.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.04120262851938605, "epoch": 0.27155465037338766, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "learning_rate": 1.828078796430805e-05, "loss": -0.0, "num_tokens": 2284126.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.12331179715692997, "epoch": 0.2722335369993211, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "learning_rate": 1.826747252199558e-05, "loss": 0.0, "num_tokens": 2290021.0, "reward": 2.075000047683716, "reward_std": 0.8811518549919128, "rewards/fixed_code_pass_all_test_reward/mean": 0.574999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.345377653837204, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.06388229411095381, "epoch": 0.2729124236252546, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "learning_rate": 1.8254110602420047e-05, "loss": 0.0, "num_tokens": 2294217.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.07360357465222478, "epoch": 0.27359131025118805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8240702280698176e-05, "loss": 0.0, "num_tokens": 2299792.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.08301210403442383, "epoch": 0.2742701968771215, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "learning_rate": 1.822724763220755e-05, "loss": -0.0, "num_tokens": 2304288.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.07697890792042017, "epoch": 0.27494908350305497, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "learning_rate": 1.8213746732586186e-05, "loss": 0.0, "num_tokens": 2309394.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 304.5, "completions/mean_terminated_length": 304.5, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.08701994083821774, "epoch": 0.2756279701289885, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8200199657732115e-05, "loss": 0.0, "num_tokens": 2316574.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 111.5, "completions/mean_terminated_length": 111.5, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.07085479702800512, "epoch": 0.27630685675492195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8186606483802945e-05, "loss": 0.0, "num_tokens": 2320498.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 135.875, "completions/mean_terminated_length": 135.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.05542711168527603, "epoch": 0.2769857433808554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.817296728721545e-05, "loss": 0.0, "num_tokens": 2324689.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 396.625, "completions/mean_terminated_length": 396.625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "entropy": 0.1278435904532671, "epoch": 0.27766463000678887, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "learning_rate": 1.815928214464511e-05, "loss": -0.0, "num_tokens": 2333054.0, "reward": 1.375, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 454.125, "completions/mean_terminated_length": 454.125, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "entropy": 0.09295008983463049, "epoch": 0.27834351663272233, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "learning_rate": 1.814555113302573e-05, "loss": -0.0, "num_tokens": 2342383.0, "reward": 1.908653736114502, "reward_std": 0.04079460725188255, "rewards/fixed_code_pass_all_test_reward/mean": 0.9086538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.04079463332891464, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.07046977197751403, "epoch": 0.2790224032586558, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "learning_rate": 1.813177432954894e-05, "loss": -0.0, "num_tokens": 2346829.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.08017634134739637, "epoch": 0.27970128988458925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.811795181166383e-05, "loss": 0.0, "num_tokens": 2350949.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 144.875, "completions/mean_terminated_length": 144.875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.08231574995443225, "epoch": 0.2803801765105227, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "learning_rate": 1.8104083657076466e-05, "loss": 0.0, "num_tokens": 2355612.0, "reward": 2.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 216.625, "completions/mean_terminated_length": 216.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.07155280746519566, "epoch": 0.28105906313645623, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "learning_rate": 1.8090169943749477e-05, "loss": -0.0, "num_tokens": 2361553.0, "reward": 1.7864582538604736, "reward_std": 0.374296635389328, "rewards/fixed_code_pass_all_test_reward/mean": 0.890625, "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02083333395421505, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07857662159949541, "epoch": 0.2817379497623897, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "learning_rate": 1.80762107499016e-05, "loss": 0.0, "num_tokens": 2366180.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.11160101648420095, "epoch": 0.28241683638832316, "frac_reward_zero_std": 0.0, "grad_norm": 3.484375, "learning_rate": 1.8062206154007267e-05, "loss": 0.0, "num_tokens": 2370810.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 185.75, "completions/mean_terminated_length": 185.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.06576718902215362, "epoch": 0.2830957230142566, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8048156234796124e-05, "loss": 0.0, "num_tokens": 2376312.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.03741574566811323, "epoch": 0.2837746096401901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8034061071252632e-05, "loss": 0.0, "num_tokens": 2380593.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 110.375, "completions/mean_terminated_length": 110.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.060843697283416986, "epoch": 0.28445349626612354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8019920742615596e-05, "loss": 0.0, "num_tokens": 2384572.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 163.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.0874741431325674, "epoch": 0.285132382892057, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8005735328377718e-05, "loss": 0.0, "num_tokens": 2389153.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.04866040498018265, "epoch": 0.2858112695179905, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "learning_rate": 1.7991504908285162e-05, "loss": 0.0, "num_tokens": 2393725.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 162.25, "completions/mean_terminated_length": 162.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.06404763692989945, "epoch": 0.286490156143924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7977229562337104e-05, "loss": 0.0, "num_tokens": 2398159.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 130.875, "completions/mean_terminated_length": 130.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.08717668987810612, "epoch": 0.28716904276985744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7962909370785283e-05, "loss": 0.0, "num_tokens": 2402366.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.09095563367009163, "epoch": 0.2878479293957909, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "learning_rate": 1.7948544414133534e-05, "loss": 0.0, "num_tokens": 2406966.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.05710427463054657, "epoch": 0.28852681602172436, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "learning_rate": 1.7934134773137364e-05, "loss": -0.0, "num_tokens": 2411339.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 101.75, "completions/mean_terminated_length": 101.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.01786201330833137, "epoch": 0.2892057026476578, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7919680528803468e-05, "loss": 0.0, "num_tokens": 2415289.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.030130391474813223, "epoch": 0.2898845892735913, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7905181762389298e-05, "loss": 0.0, "num_tokens": 2421688.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.054072245955467224, "epoch": 0.2905634758995248, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "learning_rate": 1.7890638555402585e-05, "loss": 0.0, "num_tokens": 2426969.0, "reward": 2.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.08146559819579124, "epoch": 0.29124236252545826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7876050989600908e-05, "loss": 0.0, "num_tokens": 2431562.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.07595767872408032, "epoch": 0.2919212491513917, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "learning_rate": 1.7861419146991204e-05, "loss": 0.0, "num_tokens": 2436515.0, "reward": 2.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 178.375, "completions/mean_terminated_length": 178.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.08013419527560472, "epoch": 0.2926001357773252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7846743109829318e-05, "loss": 0.0, "num_tokens": 2441118.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 141.375, "completions/mean_terminated_length": 141.375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.08432691264897585, "epoch": 0.29327902240325865, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "learning_rate": 1.7832022960619562e-05, "loss": -0.0, "num_tokens": 2445433.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 146.625, "completions/mean_terminated_length": 146.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.06273604743182659, "epoch": 0.2939579090291921, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7817258782114216e-05, "loss": 0.0, "num_tokens": 2449958.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 184.375, "completions/mean_terminated_length": 184.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.14235163014382124, "epoch": 0.29463679565512557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7802450657313086e-05, "loss": 0.0, "num_tokens": 2454913.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.11650931928306818, "epoch": 0.2953156822810591, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7787598669463027e-05, "loss": 0.0, "num_tokens": 2461209.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.02227119216695428, "epoch": 0.29599456890699255, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.777270290205749e-05, "loss": 0.0, "num_tokens": 2465538.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 185.375, "completions/mean_terminated_length": 185.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.08627740852534771, "epoch": 0.296673455532926, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "learning_rate": 1.7757763438836027e-05, "loss": -0.0, "num_tokens": 2471117.0, "reward": 2.7124998569488525, "reward_std": 0.16420802474021912, "rewards/fixed_code_pass_all_test_reward/mean": 0.7125000357627869, "rewards/fixed_code_pass_all_test_reward/std": 0.1642080694437027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 197.375, "completions/mean_terminated_length": 197.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.08697156794369221, "epoch": 0.2973523421588595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7742780363783843e-05, "loss": 0.0, "num_tokens": 2476168.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.09812935255467892, "epoch": 0.29803122878479293, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7727753761131312e-05, "loss": 0.0, "num_tokens": 2480534.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 412.875, "completions/mean_terminated_length": 412.875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.04836875991895795, "epoch": 0.2987101154107264, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "learning_rate": 1.7712683715353514e-05, "loss": -0.0, "num_tokens": 2490005.0, "reward": 2.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06183534534648061, "epoch": 0.29938900203665986, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "learning_rate": 1.7697570311169746e-05, "loss": 0.0, "num_tokens": 2495197.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.05814493494108319, "epoch": 0.3000678886625934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7682413633543057e-05, "loss": 0.0, "num_tokens": 2502815.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.07629174273461103, "epoch": 0.30074677528852684, "frac_reward_zero_std": 0.0, "grad_norm": 3.296875, "learning_rate": 1.766721376767976e-05, "loss": 0.0, "num_tokens": 2507226.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.09764100052416325, "epoch": 0.3014256619144603, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "learning_rate": 1.7651970799028976e-05, "loss": 0.0, "num_tokens": 2512764.0, "reward": 2.1785714626312256, "reward_std": 0.4691658616065979, "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.07636035233736038, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 199.125, "completions/mean_terminated_length": 199.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.060885429847985506, "epoch": 0.30210454854039376, "frac_reward_zero_std": 0.0, "grad_norm": 3.8125, "learning_rate": 1.7636684813282113e-05, "loss": 0.0, "num_tokens": 2518005.0, "reward": 2.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.12670390959829092, "epoch": 0.3027834351663272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7621355896372424e-05, "loss": 0.0, "num_tokens": 2524087.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.05900596734136343, "epoch": 0.3034623217922607, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "learning_rate": 1.760598413447451e-05, "loss": 0.0, "num_tokens": 2528757.0, "reward": 2.200000047683716, "reward_std": 0.46598589420318604, "rewards/fixed_code_pass_all_test_reward/mean": 0.699999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.32071349024772644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.03903109743259847, "epoch": 0.30414120841819414, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7590569614003825e-05, "loss": 0.0, "num_tokens": 2534125.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.05271232035011053, "epoch": 0.3048200950441276, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "learning_rate": 1.7575112421616203e-05, "loss": 0.0, "num_tokens": 2538617.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 129.625, "completions/mean_terminated_length": 129.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.0384956831112504, "epoch": 0.3054989816700611, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7559612644207364e-05, "loss": 0.0, "num_tokens": 2542782.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 221.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.03622567281126976, "epoch": 0.3061778682959946, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "learning_rate": 1.7544070368912435e-05, "loss": -0.0, "num_tokens": 2548879.0, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.06469365209341049, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 388.25, "completions/mean_terminated_length": 388.25, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "entropy": 0.054282717406749725, "epoch": 0.30685675492192804, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "learning_rate": 1.7528485683105444e-05, "loss": 0.0, "num_tokens": 2557273.0, "reward": 2.78125, "reward_std": 0.41052013635635376, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.0631052409298718, "epoch": 0.3075356415478615, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "learning_rate": 1.751285867439885e-05, "loss": -0.0, "num_tokens": 2561888.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.12077120132744312, "epoch": 0.30821452817379497, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "learning_rate": 1.7497189430643025e-05, "loss": 0.0, "num_tokens": 2566961.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 411.0, "completions/mean_terminated_length": 411.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.068185078445822, "epoch": 0.3088934147997284, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "learning_rate": 1.7481478039925784e-05, "loss": 0.0, "num_tokens": 2576025.0, "reward": 2.0714287757873535, "reward_std": 0.3818017840385437, "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.07393559068441391, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.06827958533540368, "epoch": 0.3095723014256619, "frac_reward_zero_std": 0.0, "grad_norm": 3.203125, "learning_rate": 1.746572459057188e-05, "loss": 0.0, "num_tokens": 2580603.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.09161295369267464, "epoch": 0.3102511880515954, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "learning_rate": 1.7449929171142495e-05, "loss": -0.0, "num_tokens": 2588528.0, "reward": 2.5357141494750977, "reward_std": 0.21257825195789337, "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.21257825195789337, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.08320469036698341, "epoch": 0.31093007467752887, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7434091870434772e-05, "loss": 0.0, "num_tokens": 2597376.0, "reward": 2.5999999046325684, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6000000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.041240944527089596, "epoch": 0.31160896130346233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.741821277748128e-05, "loss": 0.0, "num_tokens": 2602611.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 235.75, "completions/mean_terminated_length": 235.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.04695710772648454, "epoch": 0.3122878479293958, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "learning_rate": 1.740229198154955e-05, "loss": 0.0, "num_tokens": 2608537.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 92.375, "completions/mean_terminated_length": 92.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.04123246343806386, "epoch": 0.31296673455532925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.738632957214154e-05, "loss": 0.0, "num_tokens": 2612412.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06342339888215065, "epoch": 0.3136456211812627, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "learning_rate": 1.737032563899315e-05, "loss": -0.0, "num_tokens": 2616562.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.04811230581253767, "epoch": 0.3143245078071962, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "learning_rate": 1.7354280272073718e-05, "loss": -0.0, "num_tokens": 2621911.0, "reward": 1.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 302.75, "completions/mean_terminated_length": 302.75, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.06339940801262856, "epoch": 0.3150033944331297, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "learning_rate": 1.7338193561585507e-05, "loss": -0.0, "num_tokens": 2628821.0, "reward": 2.75, "reward_std": 0.4178554117679596, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2920915186405182, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 178.5, "completions/mean_terminated_length": 178.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.05791258532553911, "epoch": 0.31568228105906315, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "learning_rate": 1.7322065597963206e-05, "loss": 0.0, "num_tokens": 2633729.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 120.75, "completions/mean_terminated_length": 120.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07732136873528361, "epoch": 0.3163611676849966, "frac_reward_zero_std": 0.0, "grad_norm": 7.09375, "learning_rate": 1.730589647187341e-05, "loss": 0.0, "num_tokens": 2637703.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.25, "completions/mean_terminated_length": 124.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.048830126877874136, "epoch": 0.3170400543109301, "frac_reward_zero_std": 0.0, "grad_norm": 3.53125, "learning_rate": 1.7289686274214116e-05, "loss": 0.0, "num_tokens": 2641769.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.04089064942672849, "epoch": 0.31771894093686354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7273435096114223e-05, "loss": 0.0, "num_tokens": 2647033.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.06483722059056163, "epoch": 0.318397827562797, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7257143028933004e-05, "loss": 0.0, "num_tokens": 2651533.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 243.5, "completions/mean_terminated_length": 243.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.023649835726246238, "epoch": 0.31907671418873046, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "learning_rate": 1.7240810164259597e-05, "loss": 0.0, "num_tokens": 2657881.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.09029651340097189, "epoch": 0.319755600814664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.722443659391249e-05, "loss": 0.0, "num_tokens": 2663233.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.03758882568217814, "epoch": 0.32043448744059744, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "learning_rate": 1.7208022409939012e-05, "loss": -0.0, "num_tokens": 2667702.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 354.25, "completions/mean_terminated_length": 354.25, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.06241106940433383, "epoch": 0.3211133740665309, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "learning_rate": 1.7191567704614806e-05, "loss": 0.0, "num_tokens": 2675736.0, "reward": 1.8303571939468384, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.8303571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.056640565395355225, "epoch": 0.32179226069246436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.717507257044331e-05, "loss": 0.0, "num_tokens": 2679899.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.09125719266012311, "epoch": 0.3224711473183978, "frac_reward_zero_std": 0.0, "grad_norm": 4.3125, "learning_rate": 1.7158537100155256e-05, "loss": 0.0, "num_tokens": 2684318.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.07897145766764879, "epoch": 0.3231500339443313, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "learning_rate": 1.714196138670811e-05, "loss": -0.0, "num_tokens": 2688835.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 300.25, "completions/mean_terminated_length": 300.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.046984167071059346, "epoch": 0.32382892057026474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7125345523285598e-05, "loss": 0.0, "num_tokens": 2696221.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.07721525803208351, "epoch": 0.32450780719619826, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "learning_rate": 1.7108689603297134e-05, "loss": -0.0, "num_tokens": 2700705.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 141.625, "completions/mean_terminated_length": 141.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.02691166941076517, "epoch": 0.3251866938221317, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7091993720377336e-05, "loss": 0.0, "num_tokens": 2705078.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.052031297236680984, "epoch": 0.3258655804480652, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "learning_rate": 1.7075257968385472e-05, "loss": -0.0, "num_tokens": 2709472.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 111.125, "completions/mean_terminated_length": 111.125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.04639151692390442, "epoch": 0.32654446707399865, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7058482441404946e-05, "loss": 0.0, "num_tokens": 2713473.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 158.75, "completions/mean_terminated_length": 158.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.09368814900517464, "epoch": 0.3272233536999321, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "learning_rate": 1.7041667233742763e-05, "loss": 0.0, "num_tokens": 2717935.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 94.0, "completions/mean_terminated_length": 94.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.03158421581611037, "epoch": 0.32790224032586557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7024812439929004e-05, "loss": 0.0, "num_tokens": 2721847.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 254.5, "completions/mean_terminated_length": 254.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.054124286863952875, "epoch": 0.32858112695179903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7007918154716286e-05, "loss": 0.0, "num_tokens": 2728443.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.04601641371846199, "epoch": 0.3292600135777325, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "learning_rate": 1.6990984473079245e-05, "loss": -0.0, "num_tokens": 2733616.0, "reward": 2.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 250.0, "completions/mean_terminated_length": 250.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.02938453876413405, "epoch": 0.329938900203666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6974011490213976e-05, "loss": 0.0, "num_tokens": 2740064.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.0769786462187767, "epoch": 0.33061778682959947, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "learning_rate": 1.6956999301537533e-05, "loss": 0.0, "num_tokens": 2744553.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 72.375, "completions/mean_terminated_length": 72.375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.04711468797177076, "epoch": 0.33129667345553293, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6939948002687352e-05, "loss": 0.0, "num_tokens": 2748212.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 153.75, "completions/mean_terminated_length": 153.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.03385374881327152, "epoch": 0.3319755600814664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.692285768952076e-05, "loss": 0.0, "num_tokens": 2752554.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.04587622079998255, "epoch": 0.33265444670739985, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "learning_rate": 1.6905728458114384e-05, "loss": -0.0, "num_tokens": 2759175.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.10464992513880134, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "learning_rate": 1.6888560404763656e-05, "loss": -0.0, "num_tokens": 2763972.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07443650905042887, "epoch": 0.3340122199592668, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "learning_rate": 1.687135362598225e-05, "loss": -0.0, "num_tokens": 2768210.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.10057664709165692, "epoch": 0.3346911065852003, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "learning_rate": 1.6854108218501534e-05, "loss": -0.0, "num_tokens": 2775278.0, "reward": 2.8214285373687744, "reward_std": 0.16642355918884277, "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.16642354428768158, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.1446738326922059, "epoch": 0.33536999321113375, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "learning_rate": 1.6836824279270053e-05, "loss": -0.0, "num_tokens": 2780538.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 98.875, "completions/mean_terminated_length": 98.875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.032744155963882804, "epoch": 0.3360488798370672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6819501905452945e-05, "loss": 0.0, "num_tokens": 2784465.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.06475615315139294, "epoch": 0.3367277664630007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.680214119443143e-05, "loss": 0.0, "num_tokens": 2788889.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 210.125, "completions/mean_terminated_length": 210.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06889946572482586, "epoch": 0.33740665308893414, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "learning_rate": 1.6784742243802242e-05, "loss": -0.0, "num_tokens": 2794378.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.053773445542901754, "epoch": 0.3380855397148676, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "learning_rate": 1.676730515137709e-05, "loss": 0.0, "num_tokens": 2800239.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.05142463184893131, "epoch": 0.33876442634080106, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6749830015182106e-05, "loss": 0.0, "num_tokens": 2804583.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.04364967648871243, "epoch": 0.3394433129667346, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "learning_rate": 1.673231693345729e-05, "loss": -0.0, "num_tokens": 2810617.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 500 }, { "epoch": 0.3394433129667346, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 224.87262872628727, "eval_completions/max_terminated_length": 224.87262872628727, "eval_completions/mean_length": 194.9481707317073, "eval_completions/mean_terminated_length": 194.9481707317073, "eval_completions/min_length": 164.2520325203252, "eval_completions/min_terminated_length": 164.2520325203252, "eval_entropy": 0.05976972838505335, "eval_frac_reward_zero_std": 0.4742547425474255, "eval_num_tokens": 2810617.0, "eval_reward": 1.9986720918639889, "eval_reward_std": 0.20666620964159163, "eval_rewards/fixed_code_pass_all_test_reward/mean": 0.6794534814632359, "eval_rewards/fixed_code_pass_all_test_reward/std": 0.12145954475493288, "eval_rewards/format_reward/mean": 0.9915311653116531, "eval_rewards/format_reward/std": 0.01811231022604759, "eval_rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3276874435545629, "eval_rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08238629815055103, "eval_train_loss": -0.003072693943977356, "eval_train_runtime": 1018.8467, "eval_train_samples_per_second": 0.362, "eval_train_steps_per_second": 0.046, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.04556558933109045, "epoch": 0.34012219959266804, "frac_reward_zero_std": 0.0, "grad_norm": 4.78125, "learning_rate": 1.6714766004655952e-05, "loss": 0.0, "num_tokens": 2815309.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.017006220878101885, "epoch": 0.3408010862186015, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "learning_rate": 1.6697177327444185e-05, "loss": 0.0, "num_tokens": 2821332.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.029887779848650098, "epoch": 0.34147997284453496, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "learning_rate": 1.6679551000700277e-05, "loss": 0.0, "num_tokens": 2827763.0, "reward": 2.34375, "reward_std": 0.0578637570142746, "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07232411252334714, "epoch": 0.3421588594704684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6661887123514183e-05, "loss": 0.0, "num_tokens": 2832828.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07752927113324404, "epoch": 0.3428377460964019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6644185795186946e-05, "loss": 0.0, "num_tokens": 2836932.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 149.375, "completions/mean_terminated_length": 149.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.053660436533391476, "epoch": 0.34351663272233535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.662644711523014e-05, "loss": 0.0, "num_tokens": 2841335.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 315.625, "completions/mean_terminated_length": 315.625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.044008327182382345, "epoch": 0.34419551934826886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.660867118336535e-05, "loss": 0.0, "num_tokens": 2848388.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 232.75, "completions/mean_terminated_length": 232.75, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.08285042317584157, "epoch": 0.3448744059742023, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "learning_rate": 1.6590858099523545e-05, "loss": 0.0, "num_tokens": 2854106.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 110.0, "completions/mean_terminated_length": 110.0, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.051351450849324465, "epoch": 0.3455532926001358, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.657300796384457e-05, "loss": 0.0, "num_tokens": 2858162.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 139.875, "completions/mean_terminated_length": 139.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.0495811328291893, "epoch": 0.34623217922606925, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "learning_rate": 1.6555120876676557e-05, "loss": -0.0, "num_tokens": 2862753.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 137.0, "completions/mean_terminated_length": 137.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.04050322901457548, "epoch": 0.3469110658520027, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6537196938575376e-05, "loss": 0.0, "num_tokens": 2867369.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06908958125859499, "epoch": 0.34758995247793617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6519236250304058e-05, "loss": 0.0, "num_tokens": 2871837.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 194.75, "completions/mean_terminated_length": 194.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.06147625343874097, "epoch": 0.34826883910386963, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "learning_rate": 1.6501238912832226e-05, "loss": 0.0, "num_tokens": 2877115.0, "reward": 2.7916667461395264, "reward_std": 0.17251639068126678, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.17251639068126678, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.06666107149794698, "epoch": 0.34894772572980315, "frac_reward_zero_std": 0.0, "grad_norm": 3.828125, "learning_rate": 1.648320502733555e-05, "loss": 0.0, "num_tokens": 2882445.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.05938845733180642, "epoch": 0.3496266123557366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.646513469519514e-05, "loss": 0.0, "num_tokens": 2886767.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 177.5, "completions/mean_terminated_length": 177.5, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.037987842690199614, "epoch": 0.35030549898167007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.644702801799702e-05, "loss": 0.0, "num_tokens": 2892043.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 254.75, "completions/mean_terminated_length": 254.75, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.021649083821102977, "epoch": 0.35098438560760353, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "learning_rate": 1.6428885097531524e-05, "loss": 0.0, "num_tokens": 2898497.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 162.875, "completions/mean_terminated_length": 162.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.08299192041158676, "epoch": 0.351663272233537, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.641070603579273e-05, "loss": 0.0, "num_tokens": 2903064.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.06629437580704689, "epoch": 0.35234215885947046, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "learning_rate": 1.63924909349779e-05, "loss": 0.0, "num_tokens": 2907785.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.05698850331827998, "epoch": 0.3530210454854039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.63742398974869e-05, "loss": 0.0, "num_tokens": 2912497.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.040736530791036785, "epoch": 0.35369993211133743, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6355953025921606e-05, "loss": 0.0, "num_tokens": 2917745.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.09599796775728464, "epoch": 0.3543788187372709, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "learning_rate": 1.633763042308536e-05, "loss": 0.0, "num_tokens": 2922379.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 128.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.08285710355266929, "epoch": 0.35505770536320436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6319272191982364e-05, "loss": 0.0, "num_tokens": 2926643.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 130.25, "completions/mean_terminated_length": 130.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.05148724466562271, "epoch": 0.3557365919891378, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "learning_rate": 1.6300878435817115e-05, "loss": 0.0, "num_tokens": 2931189.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.08176072407513857, "epoch": 0.3564154786150713, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "learning_rate": 1.6282449257993814e-05, "loss": -0.0, "num_tokens": 2936751.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 229.875, "completions/mean_terminated_length": 229.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.04647241858765483, "epoch": 0.35709436524100474, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "learning_rate": 1.626398476211581e-05, "loss": -0.0, "num_tokens": 2943334.0, "reward": 2.2638888359069824, "reward_std": 0.5276733040809631, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 130.5, "completions/mean_terminated_length": 130.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.05590016767382622, "epoch": 0.3577732518669382, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "learning_rate": 1.624548505198498e-05, "loss": 0.0, "num_tokens": 2947610.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 340.75, "completions/mean_terminated_length": 340.75, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.033854235894978046, "epoch": 0.35845213849287166, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "learning_rate": 1.622695023160117e-05, "loss": 0.0, "num_tokens": 2955552.0, "reward": 2.21875, "reward_std": 0.8705242872238159, "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.03326621069572866, "epoch": 0.3591310251188052, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "learning_rate": 1.6208380405161623e-05, "loss": 0.0, "num_tokens": 2962468.0, "reward": 2.9250001907348633, "reward_std": 0.14880476891994476, "rewards/fixed_code_pass_all_test_reward/mean": 0.9249999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.14880475401878357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 118.5, "completions/mean_terminated_length": 118.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.0327159590087831, "epoch": 0.35980991174473864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6189775677060347e-05, "loss": 0.0, "num_tokens": 2967104.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.057204945711418986, "epoch": 0.3604887983706721, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "learning_rate": 1.6171136151887577e-05, "loss": 0.0, "num_tokens": 2971609.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 115.5, "completions/mean_terminated_length": 115.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.029368214774876833, "epoch": 0.36116768499660556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6152461934429154e-05, "loss": 0.0, "num_tokens": 2975621.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 259.25, "completions/mean_terminated_length": 259.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.030925868079066277, "epoch": 0.361846571622539, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6133753129665968e-05, "loss": 0.0, "num_tokens": 2982615.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.02856113645248115, "epoch": 0.3625254582484725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6115009842773322e-05, "loss": 0.0, "num_tokens": 2988836.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.057596494909375906, "epoch": 0.36320434487440595, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "learning_rate": 1.6096232179120388e-05, "loss": 0.0, "num_tokens": 2994040.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 94.625, "completions/mean_terminated_length": 94.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.05044847633689642, "epoch": 0.36388323150033947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6077420244269585e-05, "loss": 0.0, "num_tokens": 2998293.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.08904850669205189, "epoch": 0.3645621181262729, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "learning_rate": 1.6058574143975995e-05, "loss": -0.0, "num_tokens": 3003394.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 129.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05029344651848078, "epoch": 0.3652410047522064, "frac_reward_zero_std": 0.0, "grad_norm": 4.25, "learning_rate": 1.603969398418677e-05, "loss": -0.0, "num_tokens": 3007625.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 80.75, "completions/mean_terminated_length": 80.75, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.045220422092825174, "epoch": 0.36591989137813985, "frac_reward_zero_std": 0.0, "grad_norm": 4.1875, "learning_rate": 1.6020779871040538e-05, "loss": 0.0, "num_tokens": 3011431.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 343.25, "completions/mean_terminated_length": 343.25, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.09786451607942581, "epoch": 0.3665987780040733, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "learning_rate": 1.6001831910866795e-05, "loss": -0.0, "num_tokens": 3019889.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 102.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.058186124078929424, "epoch": 0.3672776646300068, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "learning_rate": 1.5982850210185313e-05, "loss": -0.0, "num_tokens": 3023961.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 112.75, "completions/mean_terminated_length": 112.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.050074026454240084, "epoch": 0.36795655125594023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5963834875705556e-05, "loss": 0.0, "num_tokens": 3028015.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 138.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.04945247480645776, "epoch": 0.36863543788187375, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "learning_rate": 1.5944786014326053e-05, "loss": 0.0, "num_tokens": 3033024.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.0497220391407609, "epoch": 0.3693143245078072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5925703733133823e-05, "loss": 0.0, "num_tokens": 3038064.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 98.625, "completions/mean_terminated_length": 98.625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.060638073831796646, "epoch": 0.3699932111337407, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "learning_rate": 1.5906588139403752e-05, "loss": 0.0, "num_tokens": 3042069.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 89.125, "completions/mean_terminated_length": 89.125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.03359708562493324, "epoch": 0.37067209775967414, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5887439340598002e-05, "loss": 0.0, "num_tokens": 3045998.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06563668977469206, "epoch": 0.3713509843856076, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "learning_rate": 1.5868257444365408e-05, "loss": -0.0, "num_tokens": 3051579.0, "reward": 2.9166665077209473, "reward_std": 0.12598812580108643, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.1259881556034088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 90.875, "completions/mean_terminated_length": 90.875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.10051706479862332, "epoch": 0.37202987101154106, "frac_reward_zero_std": 0.0, "grad_norm": 6.875, "learning_rate": 1.5849042558540863e-05, "loss": 0.0, "num_tokens": 3055402.0, "reward": 1.75, "reward_std": 1.0350983142852783, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.08418468851596117, "epoch": 0.3727087576374745, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "learning_rate": 1.5829794791144723e-05, "loss": 0.0, "num_tokens": 3060038.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 107.625, "completions/mean_terminated_length": 107.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.08815432898700237, "epoch": 0.37338764426340804, "frac_reward_zero_std": 0.0, "grad_norm": 4.9375, "learning_rate": 1.581051425038219e-05, "loss": 0.0, "num_tokens": 3064123.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 97.875, "completions/mean_terminated_length": 97.875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.030117509653791785, "epoch": 0.3740665308893415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5791201044642707e-05, "loss": 0.0, "num_tokens": 3067986.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.05088119860738516, "epoch": 0.37474541751527496, "frac_reward_zero_std": 0.0, "grad_norm": 3.21875, "learning_rate": 1.577185528249936e-05, "loss": -0.0, "num_tokens": 3073108.0, "reward": 2.875, "reward_std": 0.1157275140285492, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 198.5, "completions/mean_terminated_length": 198.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.020021514268592, "epoch": 0.3754243041412084, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "learning_rate": 1.5752477072708247e-05, "loss": 0.0, "num_tokens": 3078896.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 163.5, "completions/mean_terminated_length": 163.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.11634811153635383, "epoch": 0.3761031907671419, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "learning_rate": 1.5733066524207875e-05, "loss": 0.0, "num_tokens": 3083404.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 128.375, "completions/mean_terminated_length": 128.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.08624893147498369, "epoch": 0.37678207739307534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5713623746118558e-05, "loss": 0.0, "num_tokens": 3087663.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 79.25, "completions/mean_terminated_length": 79.25, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.03172642434947193, "epoch": 0.3774609640190088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5694148847741793e-05, "loss": 0.0, "num_tokens": 3091553.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 115.5, "completions/mean_terminated_length": 115.5, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.06022269558161497, "epoch": 0.3781398506449423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5674641938559644e-05, "loss": 0.0, "num_tokens": 3095981.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 165.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.04809736763127148, "epoch": 0.3788187372708758, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "learning_rate": 1.5655103128234134e-05, "loss": 0.0, "num_tokens": 3100423.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 118.25, "completions/mean_terminated_length": 118.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.019930118694901466, "epoch": 0.37949762389680924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5635532526606625e-05, "loss": 0.0, "num_tokens": 3104593.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 354.125, "completions/mean_terminated_length": 354.125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "entropy": 0.015033581294119358, "epoch": 0.3801765105227427, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5615930243697196e-05, "loss": 0.0, "num_tokens": 3112714.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 106.375, "completions/mean_terminated_length": 106.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.04087465349584818, "epoch": 0.38085539714867617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.559629638970403e-05, "loss": 0.0, "num_tokens": 3116949.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.04973500198684633, "epoch": 0.38153428377460963, "frac_reward_zero_std": 0.0, "grad_norm": 6.53125, "learning_rate": 1.5576631075002796e-05, "loss": -0.0, "num_tokens": 3121301.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 196.5, "completions/mean_terminated_length": 196.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.03898005420342088, "epoch": 0.3822131704005431, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5556934410146024e-05, "loss": 0.0, "num_tokens": 3126841.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 105.25, "completions/mean_terminated_length": 105.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.040712617337703705, "epoch": 0.38289205702647655, "frac_reward_zero_std": 0.0, "grad_norm": 7.125, "learning_rate": 1.5537206505862486e-05, "loss": 0.0, "num_tokens": 3130899.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.02091029309667647, "epoch": 0.38357094365241007, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "learning_rate": 1.5517447473056568e-05, "loss": -0.0, "num_tokens": 3136817.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.09499801602214575, "epoch": 0.38424983027834353, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "learning_rate": 1.549765742280766e-05, "loss": 0.0, "num_tokens": 3141334.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.03466602601110935, "epoch": 0.384928716904277, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5477836466369522e-05, "loss": 0.0, "num_tokens": 3145859.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 120.125, "completions/mean_terminated_length": 120.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.07435816619545221, "epoch": 0.38560760353021045, "frac_reward_zero_std": 0.0, "grad_norm": 4.875, "learning_rate": 1.5457984715169643e-05, "loss": 0.0, "num_tokens": 3150124.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.032860161969438195, "epoch": 0.3862864901561439, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "learning_rate": 1.5438102280808653e-05, "loss": -0.0, "num_tokens": 3157192.0, "reward": 1.7321428060531616, "reward_std": 0.36967799067497253, "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428060531616, "rewards/fixed_code_pass_all_test_reward/std": 0.36967799067497253, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 127.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07669706363230944, "epoch": 0.3869653767820774, "frac_reward_zero_std": 0.0, "grad_norm": 3.359375, "learning_rate": 1.541818927505966e-05, "loss": 0.0, "num_tokens": 3161358.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 137.125, "completions/mean_terminated_length": 137.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.08569513633847237, "epoch": 0.38764426340801084, "frac_reward_zero_std": 0.0, "grad_norm": 4.90625, "learning_rate": 1.5398245809867643e-05, "loss": -0.0, "num_tokens": 3165751.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 109.125, "completions/mean_terminated_length": 109.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06928400369361043, "epoch": 0.38832315003394435, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.537827199734881e-05, "loss": 0.0, "num_tokens": 3169896.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 119.375, "completions/mean_terminated_length": 119.375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.07419133419170976, "epoch": 0.3890020366598778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5358267949789968e-05, "loss": 0.0, "num_tokens": 3174003.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 177.625, "completions/mean_terminated_length": 177.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.04335285141132772, "epoch": 0.3896809232858113, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "learning_rate": 1.533823377964791e-05, "loss": 0.0, "num_tokens": 3179096.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.0432195570319891, "epoch": 0.39035980991174474, "frac_reward_zero_std": 0.0, "grad_norm": 4.09375, "learning_rate": 1.5318169599548755e-05, "loss": 0.0, "num_tokens": 3183874.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.04805750073865056, "epoch": 0.3910386965376782, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "learning_rate": 1.529807552228734e-05, "loss": 0.0, "num_tokens": 3188401.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 98.25, "completions/mean_terminated_length": 98.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.019100099802017212, "epoch": 0.39171758316361166, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5277951660826568e-05, "loss": 0.0, "num_tokens": 3192443.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.02586966985836625, "epoch": 0.3923964697895451, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "learning_rate": 1.5257798128296783e-05, "loss": -0.0, "num_tokens": 3198686.0, "reward": 1.75, "reward_std": 0.20701964199543, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.2070196568965912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 390.25, "completions/mean_terminated_length": 390.25, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "entropy": 0.020898035378195345, "epoch": 0.39307535641547864, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "learning_rate": 1.5237615037995129e-05, "loss": -0.0, "num_tokens": 3207496.0, "reward": 1.918269157409668, "reward_std": 0.013598186895251274, "rewards/fixed_code_pass_all_test_reward/mean": 0.9182692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.013598217628896236, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.030841318890452385, "epoch": 0.3937542430414121, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "learning_rate": 1.5217402503384914e-05, "loss": 0.0, "num_tokens": 3212150.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 120.375, "completions/mean_terminated_length": 120.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.04560302849858999, "epoch": 0.39443312966734556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5197160638094981e-05, "loss": 0.0, "num_tokens": 3216569.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 106.375, "completions/mean_terminated_length": 106.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.07175793964415789, "epoch": 0.395112016293279, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.517688955591905e-05, "loss": 0.0, "num_tokens": 3220652.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.032216466031968594, "epoch": 0.3957909029192125, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "learning_rate": 1.5156589370815096e-05, "loss": -0.0, "num_tokens": 3227783.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 342.375, "completions/mean_terminated_length": 342.375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.03005579300224781, "epoch": 0.39646978954514595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5136260196904704e-05, "loss": 0.0, "num_tokens": 3235618.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.019725291756913066, "epoch": 0.3971486761710794, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "learning_rate": 1.5115902148472418e-05, "loss": 0.0, "num_tokens": 3239919.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 299.125, "completions/mean_terminated_length": 299.125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.025002469774335623, "epoch": 0.3978275627970129, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "learning_rate": 1.5095515339965117e-05, "loss": -0.0, "num_tokens": 3247600.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 96.25, "completions/mean_terminated_length": 96.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.12215751688927412, "epoch": 0.3985064494229464, "frac_reward_zero_std": 0.0, "grad_norm": 6.5, "learning_rate": 1.5075099885991345e-05, "loss": 0.0, "num_tokens": 3251562.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 110.125, "completions/mean_terminated_length": 110.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.03420641226693988, "epoch": 0.39918533604887985, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5054655901320697e-05, "loss": 0.0, "num_tokens": 3255587.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.06157191004604101, "epoch": 0.3998642226748133, "frac_reward_zero_std": 0.0, "grad_norm": 4.21875, "learning_rate": 1.5034183500883153e-05, "loss": 0.0, "num_tokens": 3260975.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 84.125, "completions/mean_terminated_length": 84.125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.017459457740187645, "epoch": 0.40054310930074677, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5013682799768435e-05, "loss": 0.0, "num_tokens": 3264864.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.02103763446211815, "epoch": 0.40122199592668023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4993153913225374e-05, "loss": 0.0, "num_tokens": 3269136.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.0377178059425205, "epoch": 0.4019008825526137, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "learning_rate": 1.4972596956661229e-05, "loss": 0.0, "num_tokens": 3273642.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 96.0, "completions/mean_terminated_length": 96.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.04918831679970026, "epoch": 0.4025797691785472, "frac_reward_zero_std": 0.0, "grad_norm": 3.6875, "learning_rate": 1.495201204564109e-05, "loss": -0.0, "num_tokens": 3277570.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 111.25, "completions/mean_terminated_length": 111.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.013917091069743037, "epoch": 0.40325865580448067, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4931399295887172e-05, "loss": 0.0, "num_tokens": 3281900.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 314.0, "completions/mean_terminated_length": 314.0, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.016182406223379076, "epoch": 0.40393754243041413, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4910758823278208e-05, "loss": 0.0, "num_tokens": 3289132.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 250.625, "completions/mean_terminated_length": 250.625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.012484532431699336, "epoch": 0.4046164290563476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4890090743848774e-05, "loss": 0.0, "num_tokens": 3295609.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.04672857653349638, "epoch": 0.40529531568228105, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "learning_rate": 1.4869395173788642e-05, "loss": 0.0, "num_tokens": 3299688.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 86.25, "completions/mean_terminated_length": 86.25, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.03325698827393353, "epoch": 0.4059742023082145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4848672229442132e-05, "loss": 0.0, "num_tokens": 3303602.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 326.5, "completions/mean_terminated_length": 326.5, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.022395170526579022, "epoch": 0.406653088934148, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "learning_rate": 1.482792202730745e-05, "loss": -0.0, "num_tokens": 3311398.0, "reward": 2.580357074737549, "reward_std": 0.3975829482078552, "rewards/fixed_code_pass_all_test_reward/mean": 0.7053571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.12334916740655899, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.026530831586569548, "epoch": 0.4073319755600815, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "learning_rate": 1.4807144684036044e-05, "loss": 0.0, "num_tokens": 3317511.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 355.25, "completions/mean_terminated_length": 355.25, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "entropy": 0.012836731621064246, "epoch": 0.40801086218601496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4786340316431931e-05, "loss": 0.0, "num_tokens": 3325617.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 255.125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.012336520594544709, "epoch": 0.4086897488119484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.476550904145106e-05, "loss": 0.0, "num_tokens": 3332034.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 130.5, "completions/mean_terminated_length": 130.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.02567755780182779, "epoch": 0.4093686354378819, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "learning_rate": 1.4744650976200643e-05, "loss": -0.0, "num_tokens": 3336374.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.016870889579877257, "epoch": 0.41004752206381534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4723766237938495e-05, "loss": 0.0, "num_tokens": 3342339.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 79.125, "completions/mean_terminated_length": 79.125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.06487983511760831, "epoch": 0.4107264086897488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4702854944072383e-05, "loss": 0.0, "num_tokens": 3346076.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 89.625, "completions/mean_terminated_length": 89.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.04705160157755017, "epoch": 0.41140529531568226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4681917212159358e-05, "loss": 0.0, "num_tokens": 3349953.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 204.25, "completions/mean_terminated_length": 204.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06749026523903012, "epoch": 0.4120841819416157, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "learning_rate": 1.46609531599051e-05, "loss": -0.0, "num_tokens": 3355131.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 62.875, "completions/mean_terminated_length": 62.875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.032150683691725135, "epoch": 0.41276306856754924, "frac_reward_zero_std": 0.0, "grad_norm": 5.78125, "learning_rate": 1.4639962905163258e-05, "loss": -0.0, "num_tokens": 3358690.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.07843731855973601, "epoch": 0.4134419551934827, "frac_reward_zero_std": 0.0, "grad_norm": 3.375, "learning_rate": 1.4618946565934775e-05, "loss": 0.0, "num_tokens": 3363998.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.024500891799107194, "epoch": 0.41412084181941616, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "learning_rate": 1.4597904260367239e-05, "loss": -0.0, "num_tokens": 3370777.0, "reward": 1.5, "reward_std": 1.6035674810409546, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.03512542974203825, "epoch": 0.4147997284453496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4576836106754213e-05, "loss": 0.0, "num_tokens": 3375055.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 93.5, "completions/mean_terminated_length": 93.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.039333144668489695, "epoch": 0.4154786150712831, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.455574222353457e-05, "loss": 0.0, "num_tokens": 3378939.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 120.75, "completions/mean_terminated_length": 120.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.05375869292765856, "epoch": 0.41615750169721655, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.453462272929182e-05, "loss": 0.0, "num_tokens": 3383209.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 112.125, "completions/mean_terminated_length": 112.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.03742718417197466, "epoch": 0.41683638832315, "frac_reward_zero_std": 0.0, "grad_norm": 4.65625, "learning_rate": 1.4513477742753465e-05, "loss": 0.0, "num_tokens": 3387466.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 240.25, "completions/mean_terminated_length": 240.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.023294531973078847, "epoch": 0.4175152749490835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.44923073827903e-05, "loss": 0.0, "num_tokens": 3393548.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 139.25, "completions/mean_terminated_length": 139.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.05156797170639038, "epoch": 0.418194161575017, "frac_reward_zero_std": 0.0, "grad_norm": 3.734375, "learning_rate": 1.4471111768415777e-05, "loss": 0.0, "num_tokens": 3397854.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 90.0, "completions/mean_terminated_length": 90.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.008384114131331444, "epoch": 0.41887304820095045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.444989101878531e-05, "loss": 0.0, "num_tokens": 3401630.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 79.25, "completions/mean_terminated_length": 79.25, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.08051333297044039, "epoch": 0.4195519348268839, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4428645253195621e-05, "loss": 0.0, "num_tokens": 3405480.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 90.5, "completions/mean_terminated_length": 90.5, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.04123400757089257, "epoch": 0.42023082145281737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4407374591084064e-05, "loss": 0.0, "num_tokens": 3409412.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 411.375, "completions/mean_terminated_length": 411.375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 0.031261581694707274, "epoch": 0.42090970807875083, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "learning_rate": 1.4386079152027952e-05, "loss": -0.0, "num_tokens": 3419055.0, "reward": 2.4749999046325684, "reward_std": 0.3535533547401428, "rewards/fixed_code_pass_all_test_reward/mean": 0.6000000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.0838325722143054, "epoch": 0.4215885947046843, "frac_reward_zero_std": 0.0, "grad_norm": 3.640625, "learning_rate": 1.4364759055743888e-05, "loss": 0.0, "num_tokens": 3423139.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.03678737487643957, "epoch": 0.4222674813306178, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4343414422087093e-05, "loss": 0.0, "num_tokens": 3427379.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 159.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.0567446150816977, "epoch": 0.4229463679565513, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "learning_rate": 1.4322045371050722e-05, "loss": 0.0, "num_tokens": 3432369.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 105.75, "completions/mean_terminated_length": 105.75, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.03274666238576174, "epoch": 0.42362525458248473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4300652022765207e-05, "loss": 0.0, "num_tokens": 3436327.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 188.875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.021755606052465737, "epoch": 0.4243041412084182, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4279234497497563e-05, "loss": 0.0, "num_tokens": 3442166.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 266.25, "completions/mean_terminated_length": 266.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.011844113236293197, "epoch": 0.42498302783435166, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "learning_rate": 1.4257792915650728e-05, "loss": 0.0, "num_tokens": 3450672.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.025760697433725, "epoch": 0.4256619144602851, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "learning_rate": 1.4236327397762874e-05, "loss": 0.0, "num_tokens": 3454956.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 68.0, "completions/mean_terminated_length": 68.0, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.028030681889504194, "epoch": 0.4263408010862186, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4214838064506738e-05, "loss": 0.0, "num_tokens": 3458564.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.060217094607651234, "epoch": 0.4270196877121521, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.419332503668894e-05, "loss": 0.0, "num_tokens": 3463919.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 146.375, "completions/mean_terminated_length": 146.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.0345245657954365, "epoch": 0.42769857433808556, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "learning_rate": 1.417178843524929e-05, "loss": 0.0, "num_tokens": 3468546.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.04785546218045056, "epoch": 0.428377460964019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.415022838126015e-05, "loss": 0.0, "num_tokens": 3476161.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.023954134434461594, "epoch": 0.4290563475899525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4128644995925696e-05, "loss": 0.0, "num_tokens": 3480321.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 228.875, "completions/mean_terminated_length": 228.875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.03332836041226983, "epoch": 0.42973523421588594, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "learning_rate": 1.4107038400581288e-05, "loss": 0.0, "num_tokens": 3486368.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 308.125, "completions/mean_terminated_length": 308.125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "entropy": 0.009603998390957713, "epoch": 0.4304141208418194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.408540871669275e-05, "loss": 0.0, "num_tokens": 3493497.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 149.25, "completions/mean_terminated_length": 149.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.028482120716944337, "epoch": 0.43109300746775286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4063756065855714e-05, "loss": 0.0, "num_tokens": 3498419.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.041112816194072366, "epoch": 0.4317718940936864, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "learning_rate": 1.4042080569794916e-05, "loss": 0.0, "num_tokens": 3503906.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 115.375, "completions/mean_terminated_length": 115.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.02200512622948736, "epoch": 0.43245078071961984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4020382350363528e-05, "loss": 0.0, "num_tokens": 3508085.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 226.0, "completions/mean_terminated_length": 226.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.006985355954384431, "epoch": 0.4331296673455533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3998661529542463e-05, "loss": 0.0, "num_tokens": 3514125.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 85.75, "completions/mean_terminated_length": 85.75, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.0492265815846622, "epoch": 0.43380855397148677, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3976918229439698e-05, "loss": 0.0, "num_tokens": 3517915.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 118.875, "completions/mean_terminated_length": 118.875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.02465058397501707, "epoch": 0.4344874405974202, "frac_reward_zero_std": 0.0, "grad_norm": 4.40625, "learning_rate": 1.3955152572289568e-05, "loss": 0.0, "num_tokens": 3522034.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 198.25, "completions/mean_terminated_length": 198.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.042808918049559, "epoch": 0.4351663272233537, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "learning_rate": 1.3933364680452106e-05, "loss": -0.0, "num_tokens": 3527596.0, "reward": 2.9791665077209473, "reward_std": 0.058925628662109375, "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.05737858219072223, "epoch": 0.43584521384928715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.391155467641234e-05, "loss": 0.0, "num_tokens": 3531767.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 109.25, "completions/mean_terminated_length": 109.25, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.08171014487743378, "epoch": 0.4365241004752206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3889722682779598e-05, "loss": 0.0, "num_tokens": 3536041.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 117.75, "completions/mean_terminated_length": 117.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.10145062673836946, "epoch": 0.43720298710115413, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3867868822286838e-05, "loss": 0.0, "num_tokens": 3540071.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 151.75, "completions/mean_terminated_length": 151.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.05416981549933553, "epoch": 0.4378818737270876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3845993217789937e-05, "loss": 0.0, "num_tokens": 3544709.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 254.25, "completions/mean_terminated_length": 254.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.013641998521052301, "epoch": 0.43856076035302105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3824095992267017e-05, "loss": 0.0, "num_tokens": 3551223.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 309.875, "completions/mean_terminated_length": 309.875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "entropy": 0.02465397259220481, "epoch": 0.4392396469789545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3802177268817742e-05, "loss": 0.0, "num_tokens": 3558678.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 119.125, "completions/mean_terminated_length": 119.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.016694259480573237, "epoch": 0.439918533604888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3780237170662638e-05, "loss": 0.0, "num_tokens": 3562863.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.021779146045446396, "epoch": 0.44059742023082143, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "learning_rate": 1.3758275821142382e-05, "loss": -0.0, "num_tokens": 3566991.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 110.0, "completions/mean_terminated_length": 110.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.011041645426303148, "epoch": 0.4412763068567549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3736293343717134e-05, "loss": 0.0, "num_tokens": 3571287.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 346.5, "completions/mean_terminated_length": 346.5, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "entropy": 0.039802387123927474, "epoch": 0.4419551934826884, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "learning_rate": 1.3714289861965816e-05, "loss": -0.0, "num_tokens": 3579683.0, "reward": 2.125, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 108.125, "completions/mean_terminated_length": 108.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.0409787162207067, "epoch": 0.4426340801086219, "frac_reward_zero_std": 0.0, "grad_norm": 5.21875, "learning_rate": 1.3692265499585438e-05, "loss": -0.0, "num_tokens": 3583684.0, "reward": 2.125, "reward_std": 0.9910312294960022, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.08081695158034563, "epoch": 0.44331296673455534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3670220380390389e-05, "loss": 0.0, "num_tokens": 3588989.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.048724310006946325, "epoch": 0.4439918533604888, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "learning_rate": 1.3648154628311754e-05, "loss": 0.0, "num_tokens": 3595347.0, "reward": 2.25, "reward_std": 1.0350983142852783, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 104.125, "completions/mean_terminated_length": 104.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.023676771903410554, "epoch": 0.44467073998642226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3626068367396603e-05, "loss": 0.0, "num_tokens": 3599460.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 127.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.0415635141544044, "epoch": 0.4453496266123557, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "learning_rate": 1.3603961721807304e-05, "loss": -0.0, "num_tokens": 3603698.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 74.125, "completions/mean_terminated_length": 74.125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.047901921439915895, "epoch": 0.4460285132382892, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3581834815820817e-05, "loss": 0.0, "num_tokens": 3607539.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.023064984008669853, "epoch": 0.4467073998642227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3559687773828012e-05, "loss": 0.0, "num_tokens": 3613722.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 123.375, "completions/mean_terminated_length": 123.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.022970449412241578, "epoch": 0.44738628649015616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3537520720332943e-05, "loss": 0.0, "num_tokens": 3618021.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.08673943532630801, "epoch": 0.4480651731160896, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "learning_rate": 1.3515333779952169e-05, "loss": -0.0, "num_tokens": 3624404.0, "reward": 1.765625, "reward_std": 0.1043153703212738, "rewards/fixed_code_pass_all_test_reward/mean": 0.765625, "rewards/fixed_code_pass_all_test_reward/std": 0.1043153703212738, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 117.375, "completions/mean_terminated_length": 117.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.07286445517092943, "epoch": 0.4487440597420231, "frac_reward_zero_std": 0.0, "grad_norm": 5.40625, "learning_rate": 1.3493127077414046e-05, "loss": 0.0, "num_tokens": 3628479.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 233.125, "completions/mean_terminated_length": 233.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.05186600983142853, "epoch": 0.44942294636795654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3470900737558032e-05, "loss": 0.0, "num_tokens": 3635872.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.0088185541681014, "epoch": 0.45010183299389, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3448654885333974e-05, "loss": 0.0, "num_tokens": 3641848.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 93.0, "completions/mean_terminated_length": 93.0, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.08682233095169067, "epoch": 0.45078071961982347, "frac_reward_zero_std": 0.0, "grad_norm": 5.53125, "learning_rate": 1.3426389645801415e-05, "loss": -0.0, "num_tokens": 3646104.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 165.0, "completions/mean_terminated_length": 165.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.057350332383066416, "epoch": 0.451459606245757, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3404105144128885e-05, "loss": 0.0, "num_tokens": 3651056.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 135.5, "completions/mean_terminated_length": 135.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.04245053534395993, "epoch": 0.45213849287169044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3381801505593201e-05, "loss": 0.0, "num_tokens": 3655588.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.0375438358169049, "epoch": 0.4528173794976239, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3359478855578764e-05, "loss": 0.0, "num_tokens": 3661744.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 195.625, "completions/mean_terminated_length": 195.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.09043617080897093, "epoch": 0.45349626612355737, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "learning_rate": 1.333713731957685e-05, "loss": -0.0, "num_tokens": 3666997.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 132.25, "completions/mean_terminated_length": 132.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.053803281392902136, "epoch": 0.45417515274949083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3314777023184907e-05, "loss": 0.0, "num_tokens": 3671535.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.018220534548163414, "epoch": 0.4548540393754243, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3292398092105842e-05, "loss": 0.0, "num_tokens": 3676020.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 76.375, "completions/mean_terminated_length": 76.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.025097382487729192, "epoch": 0.45553292600135775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3270000652147339e-05, "loss": 0.0, "num_tokens": 3679751.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 78.0, "completions/mean_terminated_length": 78.0, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.01580260810442269, "epoch": 0.45621181262729127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3247584829221104e-05, "loss": 0.0, "num_tokens": 3683519.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 78.625, "completions/mean_terminated_length": 78.625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.02852762583643198, "epoch": 0.45689069925322473, "frac_reward_zero_std": 0.0, "grad_norm": 4.21875, "learning_rate": 1.3225150749342222e-05, "loss": -0.0, "num_tokens": 3687188.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 137.125, "completions/mean_terminated_length": 137.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.0470118778757751, "epoch": 0.4575695858791582, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3202698538628376e-05, "loss": 0.0, "num_tokens": 3691525.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 108.5, "completions/mean_terminated_length": 108.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.0804237388074398, "epoch": 0.45824847250509165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.318022832329921e-05, "loss": 0.0, "num_tokens": 3695505.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 97.75, "completions/mean_terminated_length": 97.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.035810349974781275, "epoch": 0.4589273591310251, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3157740229675557e-05, "loss": 0.0, "num_tokens": 3699639.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 92.875, "completions/mean_terminated_length": 92.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.04038177663460374, "epoch": 0.4596062457569586, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3135234384178772e-05, "loss": 0.0, "num_tokens": 3703646.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 70.875, "completions/mean_terminated_length": 70.875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.041391176637262106, "epoch": 0.46028513238289204, "frac_reward_zero_std": 0.0, "grad_norm": 9.0, "learning_rate": 1.311271091333e-05, "loss": 0.0, "num_tokens": 3707301.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 344.625, "completions/mean_terminated_length": 344.625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "entropy": 0.015988030936568975, "epoch": 0.4609640190088255, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3090169943749475e-05, "loss": 0.0, "num_tokens": 3715154.0, "reward": 2.3636364936828613, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 202.375, "completions/mean_terminated_length": 202.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.04510409012436867, "epoch": 0.461642905634759, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3067611602155799e-05, "loss": 0.0, "num_tokens": 3720749.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.01797198981512338, "epoch": 0.4623217922606925, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "learning_rate": 1.3045036015365233e-05, "loss": 0.0, "num_tokens": 3726931.0, "reward": 2.25, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 130.875, "completions/mean_terminated_length": 130.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.05983824376016855, "epoch": 0.46300067888662594, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "learning_rate": 1.3022443310290993e-05, "loss": -0.0, "num_tokens": 3731218.0, "reward": 2.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 147.625, "completions/mean_terminated_length": 147.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.03132125805132091, "epoch": 0.4636795655125594, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.299983361394252e-05, "loss": 0.0, "num_tokens": 3735855.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 122.625, "completions/mean_terminated_length": 122.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.04651427315548062, "epoch": 0.46435845213849286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2977207053424781e-05, "loss": 0.0, "num_tokens": 3739940.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 102.375, "completions/mean_terminated_length": 102.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.055594713892787695, "epoch": 0.4650373387644263, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "learning_rate": 1.2954563755937546e-05, "loss": 0.0, "num_tokens": 3744183.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 405.625, "completions/mean_terminated_length": 405.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.02527920249849558, "epoch": 0.4657162253903598, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2931903848774676e-05, "loss": 0.0, "num_tokens": 3753524.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 135.75, "completions/mean_terminated_length": 135.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.08573331777006388, "epoch": 0.4663951120162933, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "learning_rate": 1.2909227459323403e-05, "loss": -0.0, "num_tokens": 3760474.0, "reward": 2.8214285373687744, "reward_std": 0.22587695717811584, "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.22587695717811584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.02615616819821298, "epoch": 0.46707399864222676, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "learning_rate": 1.2886534715063626e-05, "loss": 0.0, "num_tokens": 3766273.0, "reward": 2.515625, "reward_std": 0.32346823811531067, "rewards/fixed_code_pass_all_test_reward/mean": 0.515625, "rewards/fixed_code_pass_all_test_reward/std": 0.32346823811531067, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 346.5, "completions/mean_terminated_length": 346.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.13608831726014614, "epoch": 0.4677528852681602, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "learning_rate": 1.2863825743567174e-05, "loss": 0.0, "num_tokens": 3773317.0, "reward": 2.0, "reward_std": 1.0690449476242065, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.08343358803540468, "epoch": 0.4684317718940937, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "learning_rate": 1.2841100672497116e-05, "loss": -0.0, "num_tokens": 3778023.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 93.125, "completions/mean_terminated_length": 93.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.05809004930779338, "epoch": 0.46911065852002715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2818359629607008e-05, "loss": 0.0, "num_tokens": 3782024.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 257.25, "completions/mean_terminated_length": 257.25, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.01703940413426608, "epoch": 0.4697895451459606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2795602742740217e-05, "loss": 0.0, "num_tokens": 3788274.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 101.875, "completions/mean_terminated_length": 101.875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.04470214154571295, "epoch": 0.47046843177189407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.277283013982916e-05, "loss": 0.0, "num_tokens": 3792169.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 125.875, "completions/mean_terminated_length": 125.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.0648745964281261, "epoch": 0.4711473183978276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2750041948894621e-05, "loss": 0.0, "num_tokens": 3796408.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.09087196364998817, "epoch": 0.47182620502376105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2727238298045002e-05, "loss": 0.0, "num_tokens": 3801690.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 124.75, "completions/mean_terminated_length": 124.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.0700354422442615, "epoch": 0.4725050916496945, "frac_reward_zero_std": 0.0, "grad_norm": 4.53125, "learning_rate": 1.2704419315475629e-05, "loss": -0.0, "num_tokens": 3805960.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.06748683424666524, "epoch": 0.47318397827562797, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2681585129468003e-05, "loss": 0.0, "num_tokens": 3810213.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 120.5, "completions/mean_terminated_length": 120.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.0440788296982646, "epoch": 0.47386286490156143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2658735868389113e-05, "loss": 0.0, "num_tokens": 3814433.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 64.875, "completions/mean_terminated_length": 64.875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.025556961772963405, "epoch": 0.4745417515274949, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2635871660690677e-05, "loss": 0.0, "num_tokens": 3818176.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 191.625, "completions/mean_terminated_length": 191.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.08459043968468904, "epoch": 0.47522063815342835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2612992634908454e-05, "loss": 0.0, "num_tokens": 3823413.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 162.375, "completions/mean_terminated_length": 162.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.04112151707522571, "epoch": 0.47589952477936187, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "learning_rate": 1.259009891966149e-05, "loss": -0.0, "num_tokens": 3828152.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 147.75, "completions/mean_terminated_length": 147.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.11485203076153994, "epoch": 0.47657841140529533, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "learning_rate": 1.2567190643651426e-05, "loss": -0.0, "num_tokens": 3832646.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.024960620794445276, "epoch": 0.4772572980312288, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "learning_rate": 1.2544267935661751e-05, "loss": -0.0, "num_tokens": 3840174.0, "reward": 2.3249998092651367, "reward_std": 0.41661909222602844, "rewards/fixed_code_pass_all_test_reward/mean": 0.32500001788139343, "rewards/fixed_code_pass_all_test_reward/std": 0.41661903262138367, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.062057614559307694, "epoch": 0.47793618465716226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2521330924557087e-05, "loss": 0.0, "num_tokens": 3844552.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.04860835149884224, "epoch": 0.4786150712830957, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2498379739282465e-05, "loss": 0.0, "num_tokens": 3848698.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 213.125, "completions/mean_terminated_length": 213.125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.033762684324756265, "epoch": 0.4792939579090292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2475414508862598e-05, "loss": 0.0, "num_tokens": 3854675.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 161.0, "completions/mean_terminated_length": 161.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.04638125002384186, "epoch": 0.47997284453496264, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "learning_rate": 1.2452435362401161e-05, "loss": 0.0, "num_tokens": 3859595.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 70.25, "completions/mean_terminated_length": 70.25, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.03012160398066044, "epoch": 0.48065173116089616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2429442429080054e-05, "loss": 0.0, "num_tokens": 3863397.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 232.5, "completions/mean_terminated_length": 232.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.02008997299708426, "epoch": 0.4813306177868296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2406435838158686e-05, "loss": 0.0, "num_tokens": 3874049.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 94.5, "completions/mean_terminated_length": 94.5, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.03085462236776948, "epoch": 0.4820095044127631, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2383415718973245e-05, "loss": 0.0, "num_tokens": 3878021.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 242.75, "completions/mean_terminated_length": 242.75, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.03501976956613362, "epoch": 0.48268839103869654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2360382200935971e-05, "loss": 0.0, "num_tokens": 3884267.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.1122683584690094, "epoch": 0.48336727766463, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "learning_rate": 1.2337335413534428e-05, "loss": 0.0, "num_tokens": 3888931.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 75.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.02884731814265251, "epoch": 0.48404616429056346, "frac_reward_zero_std": 0.0, "grad_norm": 7.1875, "learning_rate": 1.2314275486330778e-05, "loss": -0.0, "num_tokens": 3892627.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.0856550857424736, "epoch": 0.4847250509164969, "frac_reward_zero_std": 0.0, "grad_norm": 3.328125, "learning_rate": 1.2291202548961042e-05, "loss": 0.0, "num_tokens": 3897905.0, "reward": 1.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 126.875, "completions/mean_terminated_length": 126.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.0559721770696342, "epoch": 0.48540393754243044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.22681167311344e-05, "loss": 0.0, "num_tokens": 3902264.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.09684339864179492, "epoch": 0.4860828241683639, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2245018162632421e-05, "loss": 0.0, "num_tokens": 3907118.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.04584557842463255, "epoch": 0.48676171079429736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2221906973308365e-05, "loss": 0.0, "num_tokens": 3911522.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 389.625, "completions/mean_terminated_length": 389.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.12988235335797071, "epoch": 0.4874405974202308, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "learning_rate": 1.2198783293086442e-05, "loss": -0.0, "num_tokens": 3920423.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 97.0, "completions/mean_terminated_length": 97.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.03626663563773036, "epoch": 0.4881194840461643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.217564725196108e-05, "loss": 0.0, "num_tokens": 3924511.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 101.25, "completions/mean_terminated_length": 101.25, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.050255367532372475, "epoch": 0.48879837067209775, "frac_reward_zero_std": 0.0, "grad_norm": 4.5625, "learning_rate": 1.2152498979996195e-05, "loss": 0.0, "num_tokens": 3928489.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 106.0, "completions/mean_terminated_length": 106.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.10353991389274597, "epoch": 0.4894772572980312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2129338607324468e-05, "loss": 0.0, "num_tokens": 3932633.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 118.125, "completions/mean_terminated_length": 118.125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.06880290480330586, "epoch": 0.49015614392396467, "frac_reward_zero_std": 0.0, "grad_norm": 9.875, "learning_rate": 1.2106166264146598e-05, "loss": -0.0, "num_tokens": 3936754.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 193.875, "completions/mean_terminated_length": 193.875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.0662290845066309, "epoch": 0.4908350305498982, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "learning_rate": 1.2082982080730583e-05, "loss": 0.0, "num_tokens": 3942129.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.06920376792550087, "epoch": 0.49151391717583165, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "learning_rate": 1.2059786187410984e-05, "loss": -0.0, "num_tokens": 3946980.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.038067944813519716, "epoch": 0.4921928038017651, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2036578714588191e-05, "loss": 0.0, "num_tokens": 3951482.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 164.125, "completions/mean_terminated_length": 164.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.06174566503614187, "epoch": 0.49287169042769857, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "learning_rate": 1.2013359792727688e-05, "loss": -0.0, "num_tokens": 3956019.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.03196876007132232, "epoch": 0.49355057705363203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1990129552359326e-05, "loss": 0.0, "num_tokens": 3964737.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 227.375, "completions/mean_terminated_length": 227.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.04362545418553054, "epoch": 0.4942294636795655, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "learning_rate": 1.1966888124076584e-05, "loss": -0.0, "num_tokens": 3971356.0, "reward": 1.875, "reward_std": 0.25495100021362305, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.25495097041130066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 136.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06535260938107967, "epoch": 0.49490835030549896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1943635638535827e-05, "loss": 0.0, "num_tokens": 3975748.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.01673401170410216, "epoch": 0.4955872369314325, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.19203722264556e-05, "loss": 0.0, "num_tokens": 3981242.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 107.875, "completions/mean_terminated_length": 107.875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.050440984312444925, "epoch": 0.49626612355736593, "frac_reward_zero_std": 0.0, "grad_norm": 4.28125, "learning_rate": 1.1897098018615854e-05, "loss": 0.0, "num_tokens": 3985297.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 198.375, "completions/mean_terminated_length": 198.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.05595476645976305, "epoch": 0.4969450101832994, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "learning_rate": 1.187381314585725e-05, "loss": -0.0, "num_tokens": 3991420.0, "reward": 1.6875, "reward_std": 0.19594092667102814, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.19594095647335052, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 259.375, "completions/mean_terminated_length": 259.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.043723273323848844, "epoch": 0.49762389680923286, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "learning_rate": 1.1850517739080381e-05, "loss": -0.0, "num_tokens": 3998047.0, "reward": 2.84375, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.101027418859303, "epoch": 0.4983027834351663, "frac_reward_zero_std": 0.0, "grad_norm": 4.34375, "learning_rate": 1.1827211929245075e-05, "loss": 0.0, "num_tokens": 4002669.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 153.625, "completions/mean_terminated_length": 153.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.16557116620242596, "epoch": 0.4989816700610998, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "learning_rate": 1.1803895847369645e-05, "loss": -0.0, "num_tokens": 4007402.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.0, "completions/mean_terminated_length": 124.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.05493178544566035, "epoch": 0.49966055668703324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1780569624530135e-05, "loss": 0.0, "num_tokens": 4011658.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 106.125, "completions/mean_terminated_length": 106.125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.058956142980605364, "epoch": 0.5003394433129668, "frac_reward_zero_std": 0.0, "grad_norm": 5.21875, "learning_rate": 1.1757233391859617e-05, "loss": 0.0, "num_tokens": 4015547.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 145.625, "completions/mean_terminated_length": 145.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.04606771096587181, "epoch": 0.5010183299389002, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1733887280547425e-05, "loss": 0.0, "num_tokens": 4020016.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1013864791020751, "epoch": 0.5016972165648337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1710531421838422e-05, "loss": 0.0, "num_tokens": 4024513.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 107.375, "completions/mean_terminated_length": 107.375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.06689020246267319, "epoch": 0.5023761031907671, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "learning_rate": 1.1687165947032285e-05, "loss": -0.0, "num_tokens": 4028660.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 130.25, "completions/mean_terminated_length": 130.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06759583251550794, "epoch": 0.5030549898167006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1663790987482729e-05, "loss": 0.0, "num_tokens": 4032998.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 112.5, "completions/mean_terminated_length": 112.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07074190396815538, "epoch": 0.5037338764426341, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "learning_rate": 1.1640406674596807e-05, "loss": 0.0, "num_tokens": 4037058.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.033243893878534436, "epoch": 0.5044127630685675, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1617013139834148e-05, "loss": 0.0, "num_tokens": 4041473.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.033465131651610136, "epoch": 0.505091649694501, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "learning_rate": 1.1593610514706217e-05, "loss": -0.0, "num_tokens": 4048889.0, "reward": 1.9464285373687744, "reward_std": 0.09919504076242447, "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.09919501841068268, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 247.375, "completions/mean_terminated_length": 247.375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.01844643836375326, "epoch": 0.5057705363204344, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "learning_rate": 1.1570198930775594e-05, "loss": 0.0, "num_tokens": 4055012.0, "reward": 2.3500001430511475, "reward_std": 0.09258199483156204, "rewards/fixed_code_pass_all_test_reward/mean": 0.3500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.05799560155719519, "epoch": 0.506449422946368, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "learning_rate": 1.1546778519655209e-05, "loss": -0.0, "num_tokens": 4060441.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.04173798905685544, "epoch": 0.5071283095723014, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1523349413007633e-05, "loss": 0.0, "num_tokens": 4066203.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 106.125, "completions/mean_terminated_length": 106.125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.026790649397298694, "epoch": 0.5078071961982349, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1499911742544304e-05, "loss": 0.0, "num_tokens": 4070876.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 161.0, "completions/mean_terminated_length": 161.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.05811635544523597, "epoch": 0.5084860828241684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1476465640024814e-05, "loss": 0.0, "num_tokens": 4075348.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.05909173237159848, "epoch": 0.5091649694501018, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "learning_rate": 1.1453011237256152e-05, "loss": -0.0, "num_tokens": 4082407.0, "reward": 2.625, "reward_std": 0.4825679659843445, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.48256784677505493, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.03886709990911186, "epoch": 0.5098438560760353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1429548666091969e-05, "loss": 0.0, "num_tokens": 4086770.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 111.5, "completions/mean_terminated_length": 111.5, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.058395545929670334, "epoch": 0.5105227427019687, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "learning_rate": 1.140607805843184e-05, "loss": 0.0, "num_tokens": 4090790.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 139.125, "completions/mean_terminated_length": 139.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.032309852074831724, "epoch": 0.5112016293279023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1382599546220516e-05, "loss": 0.0, "num_tokens": 4095447.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 92.5, "completions/mean_terminated_length": 92.5, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.0683764317072928, "epoch": 0.5118805159538357, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1359113261447183e-05, "loss": 0.0, "num_tokens": 4099291.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 213.125, "completions/mean_terminated_length": 213.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.03968081600032747, "epoch": 0.5125594025797692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.133561933614473e-05, "loss": 0.0, "num_tokens": 4105764.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.05837314995005727, "epoch": 0.5132382892057027, "frac_reward_zero_std": 0.0, "grad_norm": 3.421875, "learning_rate": 1.1312117902388986e-05, "loss": -0.0, "num_tokens": 4110005.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.11437533749267459, "epoch": 0.5139171758316361, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "learning_rate": 1.1288609092298004e-05, "loss": 0.0, "num_tokens": 4114357.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 177.625, "completions/mean_terminated_length": 177.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.06471283501014113, "epoch": 0.5145960624575696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1265093038031294e-05, "loss": 0.0, "num_tokens": 4119658.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 92.75, "completions/mean_terminated_length": 92.75, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.06100855814293027, "epoch": 0.515274949083503, "frac_reward_zero_std": 0.0, "grad_norm": 3.71875, "learning_rate": 1.1241569871789096e-05, "loss": 0.0, "num_tokens": 4123480.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 122.125, "completions/mean_terminated_length": 122.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.0679061939008534, "epoch": 0.5159538357094365, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "learning_rate": 1.1218039725811626e-05, "loss": -0.0, "num_tokens": 4127697.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 183.75, "completions/mean_terminated_length": 183.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.020937865017913282, "epoch": 0.5166327223353699, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1194502732378349e-05, "loss": 0.0, "num_tokens": 4133671.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.015055299969390035, "epoch": 0.5173116089613035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1170959023807216e-05, "loss": 0.0, "num_tokens": 4139978.0, "reward": 2.375, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 309.125, "completions/mean_terminated_length": 309.125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.02946487721055746, "epoch": 0.517990495587237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1147408732453926e-05, "loss": 0.0, "num_tokens": 4147099.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 184.375, "completions/mean_terminated_length": 184.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.032556057907640934, "epoch": 0.5186693822131704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.112385199071119e-05, "loss": 0.0, "num_tokens": 4152310.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.036190603859722614, "epoch": 0.5193482688391039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1100288931007982e-05, "loss": 0.0, "num_tokens": 4157656.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.04345138184726238, "epoch": 0.5200271554650373, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "learning_rate": 1.1076719685808786e-05, "loss": 0.0, "num_tokens": 4163178.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 189.125, "completions/mean_terminated_length": 189.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.04096515872515738, "epoch": 0.5207060420909708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.105314438761287e-05, "loss": 0.0, "num_tokens": 4168715.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 112.25, "completions/mean_terminated_length": 112.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.028937431052327156, "epoch": 0.5213849287169042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.102956316895352e-05, "loss": 0.0, "num_tokens": 4173229.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 147.0, "completions/mean_terminated_length": 147.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.03585202293470502, "epoch": 0.5220638153428377, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1005976162397309e-05, "loss": 0.0, "num_tokens": 4177549.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.030628055799752474, "epoch": 0.5227427019687713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0982383500543355e-05, "loss": 0.0, "num_tokens": 4183226.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.05688394093886018, "epoch": 0.5234215885947047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0958785316022551e-05, "loss": 0.0, "num_tokens": 4187559.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 199.875, "completions/mean_terminated_length": 199.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.030420107068493962, "epoch": 0.5241004752206382, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "learning_rate": 1.0935181741496858e-05, "loss": 0.0, "num_tokens": 4192886.0, "reward": 1.875, "reward_std": 0.10350986570119858, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.1035098284482956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 243.625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.06591624300926924, "epoch": 0.5247793618465716, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "learning_rate": 1.0911572909658524e-05, "loss": -0.0, "num_tokens": 4199955.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 130.875, "completions/mean_terminated_length": 130.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.056005898863077164, "epoch": 0.5254582484725051, "frac_reward_zero_std": 0.0, "grad_norm": 4.0625, "learning_rate": 1.0887958953229349e-05, "loss": -0.0, "num_tokens": 4204130.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.0647927368991077, "epoch": 0.5261371350984385, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "learning_rate": 1.0864340004959957e-05, "loss": -0.0, "num_tokens": 4209320.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.05679185036569834, "epoch": 0.526816021724372, "frac_reward_zero_std": 0.0, "grad_norm": 4.625, "learning_rate": 1.084071619762902e-05, "loss": -0.0, "num_tokens": 4214203.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 94.625, "completions/mean_terminated_length": 94.625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.04140359256416559, "epoch": 0.5274949083503055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0817087664042536e-05, "loss": 0.0, "num_tokens": 4218152.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 126.25, "completions/mean_terminated_length": 126.25, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.09763273131102324, "epoch": 0.528173794976239, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0793454537033068e-05, "loss": 0.0, "num_tokens": 4222370.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.75, "completions/mean_terminated_length": 228.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.04697759263217449, "epoch": 0.5288526816021725, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "learning_rate": 1.0769816949459002e-05, "loss": 0.0, "num_tokens": 4228688.0, "reward": 2.6500000953674316, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.6499999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 138.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.04557292116805911, "epoch": 0.5295315682281059, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0746175034203799e-05, "loss": 0.0, "num_tokens": 4233057.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 101.0, "completions/mean_terminated_length": 101.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.05933442106470466, "epoch": 0.5302104548540394, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "learning_rate": 1.0722528924175254e-05, "loss": -0.0, "num_tokens": 4237161.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.052812352776527405, "epoch": 0.5308893414799728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0698878752304738e-05, "loss": 0.0, "num_tokens": 4242456.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07757335249334574, "epoch": 0.5315682281059063, "frac_reward_zero_std": 0.0, "grad_norm": 2.953125, "learning_rate": 1.0675224651546459e-05, "loss": -0.0, "num_tokens": 4247149.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.04613668704405427, "epoch": 0.5322471147318398, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "learning_rate": 1.0651566754876715e-05, "loss": -0.0, "num_tokens": 4251917.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.05881976243108511, "epoch": 0.5329260013577732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0627905195293135e-05, "loss": 0.0, "num_tokens": 4256885.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 149.375, "completions/mean_terminated_length": 149.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.04891295824199915, "epoch": 0.5336048879837068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0604240105813948e-05, "loss": 0.0, "num_tokens": 4261520.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 287.875, "completions/mean_terminated_length": 287.875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.05009777005761862, "epoch": 0.5342837746096402, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "learning_rate": 1.0580571619477225e-05, "loss": -0.0, "num_tokens": 4268583.0, "reward": 2.375, "reward_std": 0.5496752262115479, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.36645016074180603, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 258.75, "completions/mean_terminated_length": 258.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.03557685250416398, "epoch": 0.5349626612355737, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "learning_rate": 1.0556899869340127e-05, "loss": -0.0, "num_tokens": 4275069.0, "reward": 2.9791665077209473, "reward_std": 0.058925628662109375, "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 123.125, "completions/mean_terminated_length": 123.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.08843094576150179, "epoch": 0.5356415478615071, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0533224988478176e-05, "loss": 0.0, "num_tokens": 4279182.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 205.875, "completions/mean_terminated_length": 205.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.08234369801357388, "epoch": 0.5363204344874406, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "learning_rate": 1.0509547109984484e-05, "loss": 0.0, "num_tokens": 4284333.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.030832044780254364, "epoch": 0.5369993211133741, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "learning_rate": 1.0485866366969012e-05, "loss": 0.0, "num_tokens": 4289875.0, "reward": 2.953125, "reward_std": 0.13258251547813416, "rewards/fixed_code_pass_all_test_reward/mean": 0.953125, "rewards/fixed_code_pass_all_test_reward/std": 0.13258251547813416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 147.75, "completions/mean_terminated_length": 147.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.06975599378347397, "epoch": 0.5376782077393075, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "learning_rate": 1.0462182892557834e-05, "loss": -0.0, "num_tokens": 4294137.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 100.125, "completions/mean_terminated_length": 100.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.02164299343712628, "epoch": 0.538357094365241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0438496819892376e-05, "loss": 0.0, "num_tokens": 4298146.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 169.25, "completions/mean_terminated_length": 169.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07684734044596553, "epoch": 0.5390359809911744, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "learning_rate": 1.0414808282128668e-05, "loss": -0.0, "num_tokens": 4302668.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06801590882241726, "epoch": 0.539714867617108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.03911174124366e-05, "loss": 0.0, "num_tokens": 4307712.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.057615553960204124, "epoch": 0.5403937542430414, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0367424343999164e-05, "loss": 0.0, "num_tokens": 4312526.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.040979793295264244, "epoch": 0.5410726408689749, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "learning_rate": 1.0343729210011731e-05, "loss": 0.0, "num_tokens": 4317536.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.0, "completions/mean_terminated_length": 122.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.06204876606352627, "epoch": 0.5417515274949084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0320032143681262e-05, "loss": 0.0, "num_tokens": 4321784.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.055161413038149476, "epoch": 0.5424304141208418, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "learning_rate": 1.0296333278225599e-05, "loss": -0.0, "num_tokens": 4327735.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.0742807905189693, "epoch": 0.5431093007467753, "frac_reward_zero_std": 0.0, "grad_norm": 4.03125, "learning_rate": 1.0272632746872687e-05, "loss": 0.0, "num_tokens": 4332042.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.049888757057487965, "epoch": 0.5437881873727087, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "learning_rate": 1.0248930682859839e-05, "loss": -0.0, "num_tokens": 4337965.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 151.5, "completions/mean_terminated_length": 151.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.050024800933897495, "epoch": 0.5444670739986422, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "learning_rate": 1.0225227219432988e-05, "loss": -0.0, "num_tokens": 4343345.0, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07001071749255061, "epoch": 0.5451459606245757, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "learning_rate": 1.0201522489845927e-05, "loss": 0.0, "num_tokens": 4347694.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 344.625, "completions/mean_terminated_length": 344.625, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.032825993141159415, "epoch": 0.5458248472505092, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "learning_rate": 1.0177816627359575e-05, "loss": -0.0, "num_tokens": 4355355.0, "reward": 2.6624999046325684, "reward_std": 0.46579432487487793, "rewards/fixed_code_pass_all_test_reward/mean": 0.6625000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.46579426527023315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.056563244201242924, "epoch": 0.5465037338764427, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0154109765241214e-05, "loss": 0.0, "num_tokens": 4359550.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 151.25, "completions/mean_terminated_length": 151.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.08645900897681713, "epoch": 0.5471826205023761, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "learning_rate": 1.0130402036763747e-05, "loss": -0.0, "num_tokens": 4364032.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.051528535317629576, "epoch": 0.5478615071283096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0106693575204947e-05, "loss": 0.0, "num_tokens": 4369149.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.06195195019245148, "epoch": 0.548540393754243, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0082984513846713e-05, "loss": 0.0, "num_tokens": 4374433.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 139.75, "completions/mean_terminated_length": 139.75, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07225760025903583, "epoch": 0.5492192803801765, "frac_reward_zero_std": 0.0, "grad_norm": 3.609375, "learning_rate": 1.0059274985974305e-05, "loss": 0.0, "num_tokens": 4378959.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 73.875, "completions/mean_terminated_length": 73.875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.04130575666204095, "epoch": 0.5498981670061099, "frac_reward_zero_std": 0.0, "grad_norm": 7.375, "learning_rate": 1.0035565124875623e-05, "loss": -0.0, "num_tokens": 4382598.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.08188450988382101, "epoch": 0.5505770536320435, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "learning_rate": 1.0011855063840416e-05, "loss": -0.0, "num_tokens": 4387088.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 126.125, "completions/mean_terminated_length": 126.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06766916019842029, "epoch": 0.551255940257977, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.988144936159582e-06, "loss": 0.0, "num_tokens": 4392089.0, "reward": 2.875, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 142.625, "completions/mean_terminated_length": 142.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07640710193663836, "epoch": 0.5519348268839104, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "learning_rate": 9.96443487512438e-06, "loss": 0.0, "num_tokens": 4396510.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.07171546202152967, "epoch": 0.5526137135098439, "frac_reward_zero_std": 0.0, "grad_norm": 3.625, "learning_rate": 9.940725014025696e-06, "loss": 0.0, "num_tokens": 4401407.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 221.125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.05010488163679838, "epoch": 0.5532926001357773, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "learning_rate": 9.91701548615329e-06, "loss": 0.0, "num_tokens": 4407464.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.07103995932266116, "epoch": 0.5539714867617108, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "learning_rate": 9.893306424795055e-06, "loss": -0.0, "num_tokens": 4412431.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 339.125, "completions/mean_terminated_length": 339.125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.0642473828047514, "epoch": 0.5546503733876442, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "learning_rate": 9.869597963236253e-06, "loss": -0.0, "num_tokens": 4420120.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 117.75, "completions/mean_terminated_length": 117.75, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.06086629815399647, "epoch": 0.5553292600135777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.845890234758789e-06, "loss": 0.0, "num_tokens": 4424294.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.0363885962869972, "epoch": 0.5560081466395111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.822183372640426e-06, "loss": 0.0, "num_tokens": 4430718.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.048767969477921724, "epoch": 0.5566870332654447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.798477510154075e-06, "loss": 0.0, "num_tokens": 4435102.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 313.375, "completions/mean_terminated_length": 313.375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.04720056499354541, "epoch": 0.5573659198913782, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "learning_rate": 9.774772780567017e-06, "loss": 0.0, "num_tokens": 4442409.0, "reward": 2.5, "reward_std": 1.0690449476242065, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.054787850473076105, "epoch": 0.5580448065173116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.751069317140163e-06, "loss": 0.0, "num_tokens": 4447876.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 101.0, "completions/mean_terminated_length": 101.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.05901277018710971, "epoch": 0.5587236931432451, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.727367253127315e-06, "loss": 0.0, "num_tokens": 4452116.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 162.625, "completions/mean_terminated_length": 162.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.052709260024130344, "epoch": 0.5594025797691785, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.703666721774403e-06, "loss": 0.0, "num_tokens": 4456617.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 177.875, "completions/mean_terminated_length": 177.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.0534027679823339, "epoch": 0.560081466395112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.679967856318741e-06, "loss": 0.0, "num_tokens": 4462128.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.09539056662470102, "epoch": 0.5607603530210454, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.656270789988274e-06, "loss": 0.0, "num_tokens": 4466729.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.0635853330604732, "epoch": 0.561439239646979, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "learning_rate": 9.632575656000837e-06, "loss": -0.0, "num_tokens": 4472473.0, "reward": 2.7249999046325684, "reward_std": 0.30118808150291443, "rewards/fixed_code_pass_all_test_reward/mean": 0.7250000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.3011881411075592, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.05248894030228257, "epoch": 0.5621181262729125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.608882587563404e-06, "loss": 0.0, "num_tokens": 4476940.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 231.875, "completions/mean_terminated_length": 231.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.024271421716548502, "epoch": 0.5627970128988459, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.585191717871336e-06, "loss": 0.0, "num_tokens": 4482931.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06745913159102201, "epoch": 0.5634758995247794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.561503180107626e-06, "loss": 0.0, "num_tokens": 4487588.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.046886586118489504, "epoch": 0.5641547861507128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.53781710744217e-06, "loss": 0.0, "num_tokens": 4491962.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 191.375, "completions/mean_terminated_length": 191.375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.09951631911098957, "epoch": 0.5648336727766463, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.514133633030987e-06, "loss": 0.0, "num_tokens": 4496925.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 424.625, "completions/mean_terminated_length": 424.625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "entropy": 0.019229991012252867, "epoch": 0.5655125594025797, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.49045289001552e-06, "loss": 0.0, "num_tokens": 4505978.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 94.25, "completions/mean_terminated_length": 94.25, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.05636278260499239, "epoch": 0.5661914460285132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.466775011521825e-06, "loss": 0.0, "num_tokens": 4509828.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 197.375, "completions/mean_terminated_length": 197.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.03414254495874047, "epoch": 0.5668703326544468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.443100130659876e-06, "loss": 0.0, "num_tokens": 4515431.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 139.875, "completions/mean_terminated_length": 139.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.02369208401069045, "epoch": 0.5675492192803802, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "learning_rate": 9.41942838052278e-06, "loss": -0.0, "num_tokens": 4519934.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 96.125, "completions/mean_terminated_length": 96.125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.050153948366642, "epoch": 0.5682281059063137, "frac_reward_zero_std": 0.0, "grad_norm": 4.875, "learning_rate": 9.395759894186054e-06, "loss": 0.0, "num_tokens": 4523679.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.036000629188492894, "epoch": 0.5689069925322471, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.372094804706867e-06, "loss": 0.0, "num_tokens": 4530342.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 162.25, "completions/mean_terminated_length": 162.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.06715370435267687, "epoch": 0.5695858791581806, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.34843324512329e-06, "loss": 0.0, "num_tokens": 4535176.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 126.125, "completions/mean_terminated_length": 126.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.05983061483129859, "epoch": 0.570264765784114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.324775348453543e-06, "loss": 0.0, "num_tokens": 4540121.0, "reward": 1.875, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.09151466190814972, "epoch": 0.5709436524100475, "frac_reward_zero_std": 0.0, "grad_norm": 4.28125, "learning_rate": 9.301121247695265e-06, "loss": 0.0, "num_tokens": 4544754.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06438526068814099, "epoch": 0.571622539035981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.277471075824747e-06, "loss": 0.0, "num_tokens": 4549133.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 334.375, "completions/mean_terminated_length": 334.375, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "entropy": 0.032213454600423574, "epoch": 0.5723014256619144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.253824965796203e-06, "loss": 0.0, "num_tokens": 4556784.0, "reward": 1.875, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 194.75, "completions/mean_terminated_length": 194.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.09624125668779016, "epoch": 0.572980312287848, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "learning_rate": 9.230183050541001e-06, "loss": 0.0, "num_tokens": 4562246.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.10141824604943395, "epoch": 0.5736591989137814, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "learning_rate": 9.206545462966935e-06, "loss": 0.0, "num_tokens": 4567050.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.04522315924987197, "epoch": 0.5743380855397149, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.18291233595747e-06, "loss": 0.0, "num_tokens": 4571261.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.0619788053445518, "epoch": 0.5750169721656483, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "learning_rate": 9.159283802370981e-06, "loss": 0.0, "num_tokens": 4575941.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 67.25, "completions/mean_terminated_length": 67.25, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.03905652277171612, "epoch": 0.5756958587915818, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.135659995040046e-06, "loss": 0.0, "num_tokens": 4579559.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.03857954638078809, "epoch": 0.5763747454175153, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "learning_rate": 9.112041046770653e-06, "loss": 0.0, "num_tokens": 4585185.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 208.0, "completions/mean_terminated_length": 208.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.026656872825697064, "epoch": 0.5770536320434487, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.088427090341483e-06, "loss": 0.0, "num_tokens": 4591161.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.041646760888397694, "epoch": 0.5777325186693822, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "learning_rate": 9.064818258503145e-06, "loss": 0.0, "num_tokens": 4595622.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 184.125, "completions/mean_terminated_length": 184.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.03903352376073599, "epoch": 0.5784114052953157, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.041214683977449e-06, "loss": 0.0, "num_tokens": 4600871.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 230.625, "completions/mean_terminated_length": 230.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.029688776936382055, "epoch": 0.5790902919212492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.017616499456647e-06, "loss": 0.0, "num_tokens": 4606900.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 161.0, "completions/mean_terminated_length": 161.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.07806071871891618, "epoch": 0.5797691785471826, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "learning_rate": 8.994023837602694e-06, "loss": 0.0, "num_tokens": 4611372.0, "reward": 2.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.08304271660745144, "epoch": 0.5804480651731161, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.970436831046484e-06, "loss": 0.0, "num_tokens": 4615703.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 125.875, "completions/mean_terminated_length": 125.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.03566870419308543, "epoch": 0.5811269517990496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.946855612387134e-06, "loss": 0.0, "num_tokens": 4620350.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.068370102904737, "epoch": 0.581805838424983, "frac_reward_zero_std": 0.0, "grad_norm": 4.1875, "learning_rate": 8.923280314191215e-06, "loss": 0.0, "num_tokens": 4625504.0, "reward": 2.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 203.5, "completions/mean_terminated_length": 203.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.08393561094999313, "epoch": 0.5824847250509165, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "learning_rate": 8.899711068992023e-06, "loss": 0.0, "num_tokens": 4630516.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.07910141348838806, "epoch": 0.5831636116768499, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.876148009288813e-06, "loss": 0.0, "num_tokens": 4635068.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 93.5, "completions/mean_terminated_length": 93.5, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.04618065990507603, "epoch": 0.5838424983027835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.852591267546077e-06, "loss": 0.0, "num_tokens": 4639160.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 129.25, "completions/mean_terminated_length": 129.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.038697126088663936, "epoch": 0.5845213849287169, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.829040976192789e-06, "loss": 0.0, "num_tokens": 4643394.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 162.0, "completions/mean_terminated_length": 162.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.08845762442797422, "epoch": 0.5852002715546504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.805497267621653e-06, "loss": 0.0, "num_tokens": 4648146.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 93.5, "completions/mean_terminated_length": 93.5, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.05717147933319211, "epoch": 0.5858791581805839, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.781960274188376e-06, "loss": 0.0, "num_tokens": 4651990.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.02556638070382178, "epoch": 0.5865580448065173, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "learning_rate": 8.758430128210908e-06, "loss": 0.0, "num_tokens": 4657476.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 98.0, "completions/mean_terminated_length": 98.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.0459773913025856, "epoch": 0.5872369314324508, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "learning_rate": 8.734906961968713e-06, "loss": 0.0, "num_tokens": 4661708.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 195.625, "completions/mean_terminated_length": 195.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.05903433682397008, "epoch": 0.5879158180583842, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.711390907702001e-06, "loss": 0.0, "num_tokens": 4667233.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 163.375, "completions/mean_terminated_length": 163.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.055525312665849924, "epoch": 0.5885947046843177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.687882097611016e-06, "loss": 0.0, "num_tokens": 4672020.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.06552033172920346, "epoch": 0.5892735913102511, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "learning_rate": 8.664380663855272e-06, "loss": 0.0, "num_tokens": 4676268.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 148.625, "completions/mean_terminated_length": 148.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.05239802412688732, "epoch": 0.5899524779361847, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "learning_rate": 8.64088673855282e-06, "loss": -0.0, "num_tokens": 4680905.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 145.125, "completions/mean_terminated_length": 145.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.05006753979250789, "epoch": 0.5906313645621182, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "learning_rate": 8.617400453779487e-06, "loss": 0.0, "num_tokens": 4687674.0, "reward": 2.038461685180664, "reward_std": 0.358450710773468, "rewards/fixed_code_pass_all_test_reward/mean": 0.9134615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.08606166671961546, "epoch": 0.5913102511880516, "frac_reward_zero_std": 0.0, "grad_norm": 3.34375, "learning_rate": 8.593921941568165e-06, "loss": 0.0, "num_tokens": 4693055.0, "reward": 2.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 331.625, "completions/mean_terminated_length": 331.625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.05833883211016655, "epoch": 0.5919891378139851, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "learning_rate": 8.570451333908033e-06, "loss": -0.0, "num_tokens": 4701636.0, "reward": 2.205357074737549, "reward_std": 0.17677675187587738, "rewards/fixed_code_pass_all_test_reward/mean": 0.2678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 141.125, "completions/mean_terminated_length": 141.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07068101689219475, "epoch": 0.5926680244399185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.546988762743852e-06, "loss": 0.0, "num_tokens": 4706093.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 354.625, "completions/mean_terminated_length": 354.625, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "entropy": 0.05495765432715416, "epoch": 0.593346911065852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.52353435997519e-06, "loss": 0.0, "num_tokens": 4714314.0, "reward": 2.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 126.5, "completions/mean_terminated_length": 126.5, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.0695712030865252, "epoch": 0.5940257976917854, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.5000882574557e-06, "loss": 0.0, "num_tokens": 4718654.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.05437454581260681, "epoch": 0.594704684317719, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.476650586992372e-06, "loss": 0.0, "num_tokens": 4725589.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.053944073617458344, "epoch": 0.5953835709436525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.453221480344791e-06, "loss": 0.0, "num_tokens": 4730121.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.12104702088981867, "epoch": 0.5960624575695859, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.429801069224411e-06, "loss": 0.0, "num_tokens": 4734560.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.06245470978319645, "epoch": 0.5967413441955194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.406389485293786e-06, "loss": 0.0, "num_tokens": 4739151.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 105.5, "completions/mean_terminated_length": 105.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.031613563653081656, "epoch": 0.5974202308214528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.382986860165859e-06, "loss": 0.0, "num_tokens": 4743355.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.11220237705856562, "epoch": 0.5980991174473863, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "learning_rate": 8.359593325403195e-06, "loss": 0.0, "num_tokens": 4748268.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 182.125, "completions/mean_terminated_length": 182.125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.06048412760719657, "epoch": 0.5987780040733197, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "learning_rate": 8.336209012517273e-06, "loss": -0.0, "num_tokens": 4753429.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 198.375, "completions/mean_terminated_length": 198.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.028818948660045862, "epoch": 0.5994568906992532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.31283405296772e-06, "loss": 0.0, "num_tokens": 4759128.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 107.375, "completions/mean_terminated_length": 107.375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.05501343263313174, "epoch": 0.6001357773251867, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "learning_rate": 8.289468578161581e-06, "loss": -0.0, "num_tokens": 4763403.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 324.625, "completions/mean_terminated_length": 324.625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.03536238381639123, "epoch": 0.6008146639511202, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "learning_rate": 8.266112719452579e-06, "loss": 0.0, "num_tokens": 4771328.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 108.5, "completions/mean_terminated_length": 108.5, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.0552833154797554, "epoch": 0.6014935505770537, "frac_reward_zero_std": 0.0, "grad_norm": 3.734375, "learning_rate": 8.242766608140383e-06, "loss": -0.0, "num_tokens": 4775188.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 265.625, "completions/mean_terminated_length": 265.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.03690002113580704, "epoch": 0.6021724372029871, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "learning_rate": 8.219430375469863e-06, "loss": 0.0, "num_tokens": 4781825.0, "reward": 2.8958334922790527, "reward_std": 0.29462775588035583, "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 91.25, "completions/mean_terminated_length": 91.25, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.03606024198234081, "epoch": 0.6028513238289206, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "learning_rate": 8.19610415263036e-06, "loss": -0.0, "num_tokens": 4785563.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.057377586141228676, "epoch": 0.603530210454854, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.172788070754927e-06, "loss": 0.0, "num_tokens": 4790019.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 75.375, "completions/mean_terminated_length": 75.375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.04337291792035103, "epoch": 0.6042090970807875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.149482260919625e-06, "loss": 0.0, "num_tokens": 4793686.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 285.875, "completions/mean_terminated_length": 285.875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.04639269458130002, "epoch": 0.604887983706721, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "learning_rate": 8.126186854142752e-06, "loss": -0.0, "num_tokens": 4801005.0, "reward": 2.8181817531585693, "reward_std": 0.3366619050502777, "rewards/fixed_code_pass_all_test_reward/mean": 0.8181818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.33666184544563293, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 242.375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.01723166659940034, "epoch": 0.6055668703326544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.102901981384146e-06, "loss": 0.0, "num_tokens": 4807072.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.08192705176770687, "epoch": 0.606245756958588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.079627773544403e-06, "loss": 0.0, "num_tokens": 4812081.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 229.0, "completions/mean_terminated_length": 229.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.040311204735189676, "epoch": 0.6069246435845214, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "learning_rate": 8.056364361464176e-06, "loss": -0.0, "num_tokens": 4818121.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 101.5, "completions/mean_terminated_length": 101.5, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.044932120479643345, "epoch": 0.6076035302104549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.033111875923421e-06, "loss": 0.0, "num_tokens": 4822285.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 242.75, "completions/mean_terminated_length": 242.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.0368472533300519, "epoch": 0.6082824168363883, "frac_reward_zero_std": 0.0, "grad_norm": 3.625, "learning_rate": 8.009870447640676e-06, "loss": -0.0, "num_tokens": 4828611.0, "reward": 2.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 110.625, "completions/mean_terminated_length": 110.625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.02511978044640273, "epoch": 0.6089613034623218, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.986640207272312e-06, "loss": 0.0, "num_tokens": 4832912.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 304.25, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.026552781695500016, "epoch": 0.6096401900882552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.963421285411812e-06, "loss": 0.0, "num_tokens": 4839994.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.06595556903630495, "epoch": 0.6103190767141887, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "learning_rate": 7.940213812589018e-06, "loss": 0.0, "num_tokens": 4847255.0, "reward": 1.3035714626312256, "reward_std": 0.5423063635826111, "rewards/fixed_code_pass_all_test_reward/mean": 0.4285714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.2159797102212906, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.05004977295175195, "epoch": 0.6109979633401222, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "learning_rate": 7.91701791926942e-06, "loss": 0.0, "num_tokens": 4854500.0, "reward": 2.413461685180664, "reward_std": 0.5248475074768066, "rewards/fixed_code_pass_all_test_reward/mean": 0.9134615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.02198021998628974, "epoch": 0.6116768499660556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.893833735853404e-06, "loss": 0.0, "num_tokens": 4860498.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 92.75, "completions/mean_terminated_length": 92.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.04383000638335943, "epoch": 0.6123557365919892, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.870661392675535e-06, "loss": 0.0, "num_tokens": 4864760.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 318.875, "completions/mean_terminated_length": 318.875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.046564546413719654, "epoch": 0.6130346232179226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.847501020003806e-06, "loss": 0.0, "num_tokens": 4872167.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.03181818872690201, "epoch": 0.6137135098438561, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.824352748038924e-06, "loss": 0.0, "num_tokens": 4877633.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.048190067522227764, "epoch": 0.6143923964697895, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "learning_rate": 7.801216706913563e-06, "loss": -0.0, "num_tokens": 4882591.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 124.625, "completions/mean_terminated_length": 124.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.05867015942931175, "epoch": 0.615071283095723, "frac_reward_zero_std": 0.0, "grad_norm": 3.96875, "learning_rate": 7.778093026691636e-06, "loss": 0.0, "num_tokens": 4886820.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.058279364835470915, "epoch": 0.6157501697216565, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.75498183736758e-06, "loss": 0.0, "num_tokens": 4891496.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.03309832373633981, "epoch": 0.6164290563475899, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.731883268865601e-06, "loss": 0.0, "num_tokens": 4897670.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 116.625, "completions/mean_terminated_length": 116.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.04822292411699891, "epoch": 0.6171079429735234, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.70879745103896e-06, "loss": 0.0, "num_tokens": 4902683.0, "reward": 1.875, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 74.125, "completions/mean_terminated_length": 74.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.03923192294314504, "epoch": 0.6177868295994569, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.685724513669227e-06, "loss": 0.0, "num_tokens": 4906308.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 138.125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.029026684118434787, "epoch": 0.6184657162253904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.662664586465574e-06, "loss": 0.0, "num_tokens": 4911101.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 76.25, "completions/mean_terminated_length": 76.25, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.015714952372945845, "epoch": 0.6191446028513238, "frac_reward_zero_std": 0.0, "grad_norm": 3.375, "learning_rate": 7.63961779906403e-06, "loss": -0.0, "num_tokens": 4914807.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 135.125, "completions/mean_terminated_length": 135.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.018774282885715365, "epoch": 0.6198234894772573, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.616584281026759e-06, "loss": 0.0, "num_tokens": 4919152.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.04295201087370515, "epoch": 0.6205023761031908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.593564161841318e-06, "loss": 0.0, "num_tokens": 4923701.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07025658199563622, "epoch": 0.6211812627291242, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.57055757091995e-06, "loss": 0.0, "num_tokens": 4928141.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.08920403476804495, "epoch": 0.6218601493550577, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "learning_rate": 7.5475646375988395e-06, "loss": -0.0, "num_tokens": 4932389.0, "reward": 1.0714285373687744, "reward_std": 0.2020304799079895, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0714285746216774, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2020305097103119, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.028667444130405784, "epoch": 0.6225390359809911, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "learning_rate": 7.524585491137404e-06, "loss": 0.0, "num_tokens": 4937239.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 122.625, "completions/mean_terminated_length": 122.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.054681753274053335, "epoch": 0.6232179226069247, "frac_reward_zero_std": 0.0, "grad_norm": 5.65625, "learning_rate": 7.501620260717538e-06, "loss": 0.0, "num_tokens": 4941628.0, "reward": 2.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.09519746527075768, "epoch": 0.6238968092328581, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.478669075442917e-06, "loss": 0.0, "num_tokens": 4946055.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 163.5, "completions/mean_terminated_length": 163.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.021858258987776935, "epoch": 0.6245756958587916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.455732064338255e-06, "loss": 0.0, "num_tokens": 4951027.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.04795250901952386, "epoch": 0.6252545824847251, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.432809356348576e-06, "loss": 0.0, "num_tokens": 4957914.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.04438210651278496, "epoch": 0.6259334691106585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.409901080338512e-06, "loss": 0.0, "num_tokens": 4962826.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 99.875, "completions/mean_terminated_length": 99.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.038695571245625615, "epoch": 0.626612355736592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.38700736509155e-06, "loss": 0.0, "num_tokens": 4966681.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 82.125, "completions/mean_terminated_length": 82.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.024893229361623526, "epoch": 0.6272912423625254, "frac_reward_zero_std": 0.0, "grad_norm": 3.078125, "learning_rate": 7.364128339309326e-06, "loss": 0.0, "num_tokens": 4970330.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.08436478301882744, "epoch": 0.6279701289884589, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "learning_rate": 7.34126413161089e-06, "loss": 0.0, "num_tokens": 4976435.0, "reward": 1.640625, "reward_std": 0.23563648760318756, "rewards/fixed_code_pass_all_test_reward/mean": 0.640625, "rewards/fixed_code_pass_all_test_reward/std": 0.23563650250434875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 143.875, "completions/mean_terminated_length": 143.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.05935622798278928, "epoch": 0.6286490156143923, "frac_reward_zero_std": 0.0, "grad_norm": 3.40625, "learning_rate": 7.318414870531996e-06, "loss": -0.0, "num_tokens": 4980722.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.061224288772791624, "epoch": 0.6293279022403259, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "learning_rate": 7.2955806845243734e-06, "loss": -0.0, "num_tokens": 4986977.0, "reward": 2.9124999046325684, "reward_std": 0.2474873661994934, "rewards/fixed_code_pass_all_test_reward/mean": 0.9125000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2474873811006546, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.05595715483650565, "epoch": 0.6300067888662594, "frac_reward_zero_std": 0.0, "grad_norm": 3.65625, "learning_rate": 7.272761701955e-06, "loss": -0.0, "num_tokens": 4991317.0, "reward": 2.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 151.75, "completions/mean_terminated_length": 151.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.0885505573824048, "epoch": 0.6306856754921928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.249958051105383e-06, "loss": 0.0, "num_tokens": 4995827.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 131.375, "completions/mean_terminated_length": 131.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06484300037845969, "epoch": 0.6313645621181263, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "learning_rate": 7.227169860170845e-06, "loss": 0.0, "num_tokens": 5000222.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 309.0, "completions/mean_terminated_length": 309.0, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.031262818491086364, "epoch": 0.6320434487440597, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "learning_rate": 7.2043972572597855e-06, "loss": 0.0, "num_tokens": 5007214.0, "reward": 1.8035714626312256, "reward_std": 0.3234066367149353, "rewards/fixed_code_pass_all_test_reward/mean": 0.8035714030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.3234066069126129, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.03661538939923048, "epoch": 0.6327223353699932, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "learning_rate": 7.181640370392994e-06, "loss": 0.0, "num_tokens": 5012050.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 116.5, "completions/mean_terminated_length": 116.5, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.07277259835973382, "epoch": 0.6334012219959266, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "learning_rate": 7.1588993275028885e-06, "loss": 0.0, "num_tokens": 5016254.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 259.125, "completions/mean_terminated_length": 259.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.060410027392208576, "epoch": 0.6340801086218602, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "learning_rate": 7.136174256432828e-06, "loss": 0.0, "num_tokens": 5023439.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.1308068847283721, "epoch": 0.6347589952477937, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "learning_rate": 7.113465284936378e-06, "loss": 0.0, "num_tokens": 5028269.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 230.625, "completions/mean_terminated_length": 230.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.0925988806411624, "epoch": 0.6354378818737271, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.090772540676598e-06, "loss": 0.0, "num_tokens": 5034882.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 203.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.07168037118390203, "epoch": 0.6361167684996606, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "learning_rate": 7.0680961512253254e-06, "loss": -0.0, "num_tokens": 5040193.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 134.25, "completions/mean_terminated_length": 134.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.056642959360033274, "epoch": 0.636795655125594, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.045436244062458e-06, "loss": 0.0, "num_tokens": 5044619.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 163.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.08202037401497364, "epoch": 0.6374745417515275, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.022792946575222e-06, "loss": 0.0, "num_tokens": 5049160.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.060413535218685865, "epoch": 0.6381534283774609, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "learning_rate": 7.000166386057483e-06, "loss": 0.0, "num_tokens": 5053809.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 102.375, "completions/mean_terminated_length": 102.375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.04014542978256941, "epoch": 0.6388323150033944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.977556689709009e-06, "loss": 0.0, "num_tokens": 5057820.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 116.75, "completions/mean_terminated_length": 116.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.04275320563465357, "epoch": 0.639511201629328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.954963984634768e-06, "loss": 0.0, "num_tokens": 5062522.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 113.5, "completions/mean_terminated_length": 113.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07519625313580036, "epoch": 0.6401900882552614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.932388397844204e-06, "loss": 0.0, "num_tokens": 5066638.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.10464443638920784, "epoch": 0.6408689748811949, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.909830056250527e-06, "loss": 0.0, "num_tokens": 5071311.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.08981845155358315, "epoch": 0.6415478615071283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.887289086670004e-06, "loss": 0.0, "num_tokens": 5076578.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.0879927696660161, "epoch": 0.6422267481330618, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "learning_rate": 6.864765615821231e-06, "loss": 0.0, "num_tokens": 5081630.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.04189127474091947, "epoch": 0.6429056347589952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.842259770324447e-06, "loss": 0.0, "num_tokens": 5088749.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.020852818968705833, "epoch": 0.6435845213849287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.819771676700794e-06, "loss": 0.0, "num_tokens": 5095167.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.07304209750145674, "epoch": 0.6442634080108622, "frac_reward_zero_std": 0.0, "grad_norm": 3.703125, "learning_rate": 6.797301461371626e-06, "loss": 0.0, "num_tokens": 5102162.0, "reward": 2.857142925262451, "reward_std": 0.2020304650068283, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.2020305097103119, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 384.125, "completions/mean_terminated_length": 384.125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "entropy": 0.02881737658753991, "epoch": 0.6449422946367956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.774849250657784e-06, "loss": 0.0, "num_tokens": 5110963.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 344.375, "completions/mean_terminated_length": 344.375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "entropy": 0.0376912287902087, "epoch": 0.6456211812627292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.752415170778894e-06, "loss": 0.0, "num_tokens": 5118806.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 97.375, "completions/mean_terminated_length": 97.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.08059882931411266, "epoch": 0.6463000678886626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.729999347852665e-06, "loss": 0.0, "num_tokens": 5122641.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 151.625, "completions/mean_terminated_length": 151.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07680756133049726, "epoch": 0.6469789545145961, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.707601907894159e-06, "loss": 0.0, "num_tokens": 5127054.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 154.25, "completions/mean_terminated_length": 154.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06024423544295132, "epoch": 0.6476578411405295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.6852229768150976e-06, "loss": 0.0, "num_tokens": 5131408.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 350.875, "completions/mean_terminated_length": 350.875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.056971298065036535, "epoch": 0.648336727766463, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "learning_rate": 6.662862680423153e-06, "loss": -0.0, "num_tokens": 5139223.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 78.0, "completions/mean_terminated_length": 78.0, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.011453297920525074, "epoch": 0.6490156143923965, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.640521144421237e-06, "loss": 0.0, "num_tokens": 5142887.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 122.375, "completions/mean_terminated_length": 122.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.04991039214655757, "epoch": 0.6496945010183299, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.618198494406802e-06, "loss": 0.0, "num_tokens": 5146922.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 245.375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.046746176201850176, "epoch": 0.6503733876442634, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "learning_rate": 6.595894855871119e-06, "loss": 0.0, "num_tokens": 5153085.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.031448905589058995, "epoch": 0.6510522742701969, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "learning_rate": 6.573610354198587e-06, "loss": -0.0, "num_tokens": 5159418.0, "reward": 2.6875, "reward_std": 0.25877460837364197, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 330.75, "completions/mean_terminated_length": 330.75, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.05657206545583904, "epoch": 0.6517311608961304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.55134511466603e-06, "loss": 0.0, "num_tokens": 5167200.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.07719372538849711, "epoch": 0.6524100475220638, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "learning_rate": 6.52909926244197e-06, "loss": 0.0, "num_tokens": 5172452.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 118.625, "completions/mean_terminated_length": 118.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.055858482141047716, "epoch": 0.6530889341479973, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.506872922585956e-06, "loss": 0.0, "num_tokens": 5177393.0, "reward": 2.875, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 102.5, "completions/mean_terminated_length": 102.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.06594048859551549, "epoch": 0.6537678207739308, "frac_reward_zero_std": 0.0, "grad_norm": 10.5, "learning_rate": 6.484666220047835e-06, "loss": -0.0, "num_tokens": 5181301.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 141.625, "completions/mean_terminated_length": 141.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.03669480653479695, "epoch": 0.6544467073998642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.4624792796670624e-06, "loss": 0.0, "num_tokens": 5185666.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 139.875, "completions/mean_terminated_length": 139.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.040769357699900866, "epoch": 0.6551255940257977, "frac_reward_zero_std": 0.0, "grad_norm": 3.5625, "learning_rate": 6.440312226171992e-06, "loss": 0.0, "num_tokens": 5190185.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.038649448659271, "epoch": 0.6558044806517311, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "learning_rate": 6.418165184179183e-06, "loss": -0.0, "num_tokens": 5196639.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06659081997349858, "epoch": 0.6564833672776647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.396038278192698e-06, "loss": 0.0, "num_tokens": 5201246.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 111.75, "completions/mean_terminated_length": 111.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.025740576442331076, "epoch": 0.6571622539035981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.3739316326034005e-06, "loss": 0.0, "num_tokens": 5205540.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.04978251946158707, "epoch": 0.6578411405295316, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "learning_rate": 6.35184537168825e-06, "loss": 0.0, "num_tokens": 5209765.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.10360525920987129, "epoch": 0.658520027155465, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "learning_rate": 6.329779619609615e-06, "loss": 0.0, "num_tokens": 5215559.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 211.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.07118803868070245, "epoch": 0.6591989137813985, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "learning_rate": 6.307734500414564e-06, "loss": -0.0, "num_tokens": 5221121.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 133.625, "completions/mean_terminated_length": 133.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06685220077633858, "epoch": 0.659877800407332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.285710138034187e-06, "loss": 0.0, "num_tokens": 5225542.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 365.0, "completions/mean_terminated_length": 365.0, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.07614701706916094, "epoch": 0.6605566870332654, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "learning_rate": 6.263706656282869e-06, "loss": -0.0, "num_tokens": 5234774.0, "reward": 2.6999998092651367, "reward_std": 0.18516404926776886, "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.18516401946544647, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.0702132573351264, "epoch": 0.6612355736591989, "frac_reward_zero_std": 0.0, "grad_norm": 3.859375, "learning_rate": 6.241724178857621e-06, "loss": 0.0, "num_tokens": 5240232.0, "reward": 1.7000000476837158, "reward_std": 0.10690444707870483, "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.10690449178218842, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 123.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.034311452182009816, "epoch": 0.6619144602851323, "frac_reward_zero_std": 0.0, "grad_norm": 4.90625, "learning_rate": 6.219762829337367e-06, "loss": 0.0, "num_tokens": 5244434.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 139.875, "completions/mean_terminated_length": 139.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.07672926783561707, "epoch": 0.6625933469110659, "frac_reward_zero_std": 0.0, "grad_norm": 4.0625, "learning_rate": 6.197822731182259e-06, "loss": -0.0, "num_tokens": 5248817.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.04931910941377282, "epoch": 0.6632722335369993, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "learning_rate": 6.1759040077329845e-06, "loss": 0.0, "num_tokens": 5253596.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 213.125, "completions/mean_terminated_length": 213.125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.017579364706762135, "epoch": 0.6639511201629328, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "learning_rate": 6.154006782210066e-06, "loss": 0.0, "num_tokens": 5259917.0, "reward": 2.359375, "reward_std": 0.04419417306780815, "rewards/fixed_code_pass_all_test_reward/mean": 0.359375, "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 81.5, "completions/mean_terminated_length": 81.5, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.05019477568566799, "epoch": 0.6646300067888663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.132131177713165e-06, "loss": 0.0, "num_tokens": 5263665.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 98.625, "completions/mean_terminated_length": 98.625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.07368154264986515, "epoch": 0.6653088934147997, "frac_reward_zero_std": 0.0, "grad_norm": 6.21875, "learning_rate": 6.1102773172204034e-06, "loss": 0.0, "num_tokens": 5267638.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.0644279196858406, "epoch": 0.6659877800407332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.0884453235876615e-06, "loss": 0.0, "num_tokens": 5275468.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 163.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.025917271384969354, "epoch": 0.6666666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.066635319547895e-06, "loss": 0.0, "num_tokens": 5280529.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.018838520161807537, "epoch": 0.6673455532926001, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.0448474277104365e-06, "loss": 0.0, "num_tokens": 5286114.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 421.5, "completions/mean_terminated_length": 421.5, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.0307229021564126, "epoch": 0.6680244399185336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.023081770560307e-06, "loss": 0.0, "num_tokens": 5295438.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.05108774080872536, "epoch": 0.6687033265444671, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.0013384704575406e-06, "loss": 0.0, "num_tokens": 5299791.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 428.5, "completions/mean_terminated_length": 428.5, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "entropy": 0.029764720937237144, "epoch": 0.6693822131704006, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "learning_rate": 5.9796176496364735e-06, "loss": 0.0, "num_tokens": 5309187.0, "reward": 2.1750001907348633, "reward_std": 0.046291034668684006, "rewards/fixed_code_pass_all_test_reward/mean": 0.17500001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.04629100486636162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 315.75, "completions/mean_terminated_length": 315.75, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.03381944540888071, "epoch": 0.670061099796334, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "learning_rate": 5.957919430205088e-06, "loss": 0.0, "num_tokens": 5316697.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 233.75, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.033371551195159554, "epoch": 0.6707399864222675, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "learning_rate": 5.93624393414429e-06, "loss": -0.0, "num_tokens": 5323071.0, "reward": 1.875, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 192.125, "completions/mean_terminated_length": 192.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.03332230914384127, "epoch": 0.6714188730482009, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.9145912833072535e-06, "loss": 0.0, "num_tokens": 5328632.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 201.625, "completions/mean_terminated_length": 201.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.022967584896832705, "epoch": 0.6720977596741344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.892961599418716e-06, "loss": 0.0, "num_tokens": 5334477.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.875, "completions/mean_terminated_length": 121.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.056876023299992085, "epoch": 0.6727766463000678, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.871355004074304e-06, "loss": 0.0, "num_tokens": 5338668.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 80.125, "completions/mean_terminated_length": 80.125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.033333154395222664, "epoch": 0.6734555329260014, "frac_reward_zero_std": 0.0, "grad_norm": 14.5, "learning_rate": 5.849771618739852e-06, "loss": 0.0, "num_tokens": 5342381.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 219.375, "completions/mean_terminated_length": 219.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.05435479525476694, "epoch": 0.6741344195519349, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "learning_rate": 5.82821156475071e-06, "loss": -0.0, "num_tokens": 5348688.0, "reward": 2.8125, "reward_std": 0.1157275140285492, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 344.5, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "entropy": 0.0387456719763577, "epoch": 0.6748133061778683, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.8066749633110675e-06, "loss": 0.0, "num_tokens": 5356588.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.040065045235678554, "epoch": 0.6754921928038018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.785161935493266e-06, "loss": 0.0, "num_tokens": 5360905.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 237.125, "completions/mean_terminated_length": 237.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.04246202833019197, "epoch": 0.6761710794297352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.763672602237129e-06, "loss": 0.0, "num_tokens": 5367378.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 188.625, "completions/mean_terminated_length": 188.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.019586203852668405, "epoch": 0.6768499660556687, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.742207084349274e-06, "loss": 0.0, "num_tokens": 5372831.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 223.875, "completions/mean_terminated_length": 223.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.06590558681637049, "epoch": 0.6775288526816021, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "learning_rate": 5.72076550250244e-06, "loss": -0.0, "num_tokens": 5378574.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 237.625, "completions/mean_terminated_length": 237.625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.021161453099921346, "epoch": 0.6782077393075356, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.699347977234799e-06, "loss": 0.0, "num_tokens": 5386203.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 145.625, "completions/mean_terminated_length": 145.625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.055856657680124044, "epoch": 0.6788866259334692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.677954628949281e-06, "loss": 0.0, "num_tokens": 5390592.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1000 }, { "epoch": 0.6788866259334692, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 227.21138211382114, "eval_completions/max_terminated_length": 227.21138211382114, "eval_completions/mean_length": 191.86957994579944, "eval_completions/mean_terminated_length": 191.86957994579944, "eval_completions/min_length": 155.69647696476966, "eval_completions/min_terminated_length": 155.69647696476966, "eval_entropy": 0.056565259366238185, "eval_frac_reward_zero_std": 0.5040650406504065, "eval_num_tokens": 5390592.0, "eval_reward": 2.018510837218949, "eval_reward_std": 0.2031349181159725, "eval_rewards/fixed_code_pass_all_test_reward/mean": 0.6943238458336207, "eval_rewards/fixed_code_pass_all_test_reward/std": 0.11917512425723761, "eval_rewards/format_reward/mean": 0.9888211382113821, "eval_rewards/format_reward/std": 0.016047745781539256, "eval_rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3353658536585366, "eval_rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08650764031461906, "eval_train_loss": 0.003139395033940673, "eval_train_runtime": 1028.3216, "eval_train_samples_per_second": 0.359, "eval_train_steps_per_second": 0.046, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.09460367681458592, "epoch": 0.6795655125594026, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "learning_rate": 5.656585577912908e-06, "loss": -0.0, "num_tokens": 5395086.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.07179732667282224, "epoch": 0.6802443991853361, "frac_reward_zero_std": 0.0, "grad_norm": 3.578125, "learning_rate": 5.635240944256113e-06, "loss": 0.0, "num_tokens": 5400852.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 106.0, "completions/mean_terminated_length": 106.0, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.029686111956834793, "epoch": 0.6809232858112695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.613920847972052e-06, "loss": 0.0, "num_tokens": 5404852.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.07532089110463858, "epoch": 0.681602172437203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.592625408915939e-06, "loss": 0.0, "num_tokens": 5409955.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 171.875, "completions/mean_terminated_length": 171.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.05466371215879917, "epoch": 0.6822810590631364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.571354746804383e-06, "loss": 0.0, "num_tokens": 5414634.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 122.125, "completions/mean_terminated_length": 122.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.05604705912992358, "epoch": 0.6829599456890699, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.550108981214692e-06, "loss": 0.0, "num_tokens": 5418667.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06336376816034317, "epoch": 0.6836388323150034, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "learning_rate": 5.5288882315842265e-06, "loss": 0.0, "num_tokens": 5423378.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.0484875850379467, "epoch": 0.6843177189409368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.507692617209701e-06, "loss": 0.0, "num_tokens": 5427729.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.06522973626852036, "epoch": 0.6849966055668704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.486522257246538e-06, "loss": 0.0, "num_tokens": 5432478.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 127.125, "completions/mean_terminated_length": 127.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.05258332472294569, "epoch": 0.6856754921928038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.465377270708183e-06, "loss": 0.0, "num_tokens": 5437183.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.03418240649625659, "epoch": 0.6863543788187373, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "learning_rate": 5.4442577764654334e-06, "loss": 0.0, "num_tokens": 5443582.0, "reward": 2.8541667461395264, "reward_std": 0.20773717761039734, "rewards/fixed_code_pass_all_test_reward/mean": 0.8541666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.20773723721504211, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.05349999386817217, "epoch": 0.6870332654446707, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "learning_rate": 5.423163893245786e-06, "loss": -0.0, "num_tokens": 5448403.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.02258082269690931, "epoch": 0.6877121520706042, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "learning_rate": 5.402095739632763e-06, "loss": -0.0, "num_tokens": 5454536.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 147.0, "completions/mean_terminated_length": 147.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.04335205093957484, "epoch": 0.6883910386965377, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.381053434065229e-06, "loss": 0.0, "num_tokens": 5458816.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 179.375, "completions/mean_terminated_length": 179.375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.052615988068282604, "epoch": 0.6890699253224711, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "learning_rate": 5.360037094836745e-06, "loss": -0.0, "num_tokens": 5463347.0, "reward": 2.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.031195369083434343, "epoch": 0.6897488119484046, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.339046840094899e-06, "loss": 0.0, "num_tokens": 5469902.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.0740518462844193, "epoch": 0.6904276985743381, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.318082787840646e-06, "loss": 0.0, "num_tokens": 5474257.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06399343302473426, "epoch": 0.6911065852002716, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "learning_rate": 5.297145055927622e-06, "loss": 0.0, "num_tokens": 5478505.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.018098314991220832, "epoch": 0.691785471826205, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "learning_rate": 5.276233762061507e-06, "loss": 0.0, "num_tokens": 5484986.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.06366366753354669, "epoch": 0.6924643584521385, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "learning_rate": 5.255349023799357e-06, "loss": -0.0, "num_tokens": 5490229.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 103.25, "completions/mean_terminated_length": 103.25, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.06839914806187153, "epoch": 0.693143245078072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.234490958548941e-06, "loss": 0.0, "num_tokens": 5494047.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06942394003272057, "epoch": 0.6938221317040054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.213659683568073e-06, "loss": 0.0, "num_tokens": 5499116.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.04457291937433183, "epoch": 0.6945010183299389, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.192855315963959e-06, "loss": 0.0, "num_tokens": 5504526.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.07382111297920346, "epoch": 0.6951799049558723, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.172077972692553e-06, "loss": 0.0, "num_tokens": 5508978.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 187.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.063003228046, "epoch": 0.6958587915818059, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "learning_rate": 5.15132777055787e-06, "loss": 0.0, "num_tokens": 5513962.0, "reward": 2.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 137.875, "completions/mean_terminated_length": 137.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.10102774389088154, "epoch": 0.6965376782077393, "frac_reward_zero_std": 0.0, "grad_norm": 4.375, "learning_rate": 5.130604826211361e-06, "loss": 0.0, "num_tokens": 5518305.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 324.875, "completions/mean_terminated_length": 324.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.03406269242987037, "epoch": 0.6972165648336728, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "learning_rate": 5.109909256151227e-06, "loss": 0.0, "num_tokens": 5525776.0, "reward": 2.950000047683716, "reward_std": 0.1414213627576828, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.1569840693846345, "epoch": 0.6978954514596063, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.089241176721794e-06, "loss": 0.0, "num_tokens": 5530754.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 136.375, "completions/mean_terminated_length": 136.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.04225786775350571, "epoch": 0.6985743380855397, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.068600704112832e-06, "loss": 0.0, "num_tokens": 5534885.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.05938868597149849, "epoch": 0.6992532247114732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.047987954358912e-06, "loss": 0.0, "num_tokens": 5539392.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 139.375, "completions/mean_terminated_length": 139.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.03899049502797425, "epoch": 0.6999321113374066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.02740304333877e-06, "loss": 0.0, "num_tokens": 5543587.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 98.125, "completions/mean_terminated_length": 98.125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.04210246494039893, "epoch": 0.7006109979633401, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.006846086774631e-06, "loss": 0.0, "num_tokens": 5547580.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 133.5, "completions/mean_terminated_length": 133.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.02648665476590395, "epoch": 0.7012898845892735, "frac_reward_zero_std": 0.0, "grad_norm": 4.625, "learning_rate": 4.9863172002315675e-06, "loss": 0.0, "num_tokens": 5552200.0, "reward": 2.9000000953674316, "reward_std": 0.2828427255153656, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.033049843506887555, "epoch": 0.7019687712152071, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "learning_rate": 4.965816499116849e-06, "loss": 0.0, "num_tokens": 5559316.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 211.625, "completions/mean_terminated_length": 211.625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.054559324868023396, "epoch": 0.7026476578411406, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.945344098679302e-06, "loss": 0.0, "num_tokens": 5564801.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 205.75, "completions/mean_terminated_length": 205.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.04275689972564578, "epoch": 0.703326544467074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.924900114008656e-06, "loss": 0.0, "num_tokens": 5570343.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 98.375, "completions/mean_terminated_length": 98.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.049867642344906926, "epoch": 0.7040054310930075, "frac_reward_zero_std": 0.0, "grad_norm": 3.859375, "learning_rate": 4.904484660034887e-06, "loss": 0.0, "num_tokens": 5574522.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.05536046577617526, "epoch": 0.7046843177189409, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "learning_rate": 4.8840978515275816e-06, "loss": 0.0, "num_tokens": 5581161.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06532919779419899, "epoch": 0.7053632043448744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.863739803095299e-06, "loss": 0.0, "num_tokens": 5585679.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.025774283800274134, "epoch": 0.7060420909708078, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.8434106291849035e-06, "loss": 0.0, "num_tokens": 5591577.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07337108254432678, "epoch": 0.7067209775967414, "frac_reward_zero_std": 0.0, "grad_norm": 3.546875, "learning_rate": 4.8231104440809524e-06, "loss": 0.0, "num_tokens": 5596052.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 246.75, "completions/mean_terminated_length": 246.75, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.030016270466148853, "epoch": 0.7073998642226749, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "learning_rate": 4.802839361905021e-06, "loss": 0.0, "num_tokens": 5602530.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 160.875, "completions/mean_terminated_length": 160.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07069768756628036, "epoch": 0.7080787508486083, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "learning_rate": 4.782597496615088e-06, "loss": -0.0, "num_tokens": 5607417.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06086717499420047, "epoch": 0.7087576374745418, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "learning_rate": 4.762384962004877e-06, "loss": 0.0, "num_tokens": 5612206.0, "reward": 2.3125, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.04599870974197984, "epoch": 0.7094365241004752, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "learning_rate": 4.74220187170322e-06, "loss": 0.0, "num_tokens": 5616765.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.05836607748642564, "epoch": 0.7101154107264087, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.7220483391734325e-06, "loss": 0.0, "num_tokens": 5621081.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 186.875, "completions/mean_terminated_length": 186.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.0899033397436142, "epoch": 0.7107942973523421, "frac_reward_zero_std": 0.0, "grad_norm": 3.90625, "learning_rate": 4.701924477712663e-06, "loss": -0.0, "num_tokens": 5626024.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 323.25, "completions/mean_terminated_length": 323.25, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.033922820119187236, "epoch": 0.7114731839782756, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.681830400451249e-06, "loss": 0.0, "num_tokens": 5634474.0, "reward": 2.2857143878936768, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 120.625, "completions/mean_terminated_length": 120.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.06572171626612544, "epoch": 0.712152070604209, "frac_reward_zero_std": 0.0, "grad_norm": 3.390625, "learning_rate": 4.661766220352098e-06, "loss": 0.0, "num_tokens": 5638783.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.049859441351145506, "epoch": 0.7128309572301426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.641732050210032e-06, "loss": 0.0, "num_tokens": 5643278.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 324.125, "completions/mean_terminated_length": 324.125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 0.030875551281496882, "epoch": 0.7135098438560761, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.621728002651194e-06, "loss": 0.0, "num_tokens": 5650959.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 205.25, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.04553757677786052, "epoch": 0.7141887304820095, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.6017541901323605e-06, "loss": 0.0, "num_tokens": 5656569.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.05938792508095503, "epoch": 0.714867617107943, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "learning_rate": 4.581810724940343e-06, "loss": 0.0, "num_tokens": 5661121.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 94.375, "completions/mean_terminated_length": 94.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.041229897644370794, "epoch": 0.7155465037338764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.561897719191349e-06, "loss": 0.0, "num_tokens": 5664964.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 309.375, "completions/mean_terminated_length": 309.375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.03235103120096028, "epoch": 0.7162253903598099, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "learning_rate": 4.542015284830358e-06, "loss": -0.0, "num_tokens": 5672423.0, "reward": 2.6363635063171387, "reward_std": 0.3887436091899872, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.3887436091899872, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.053834905847907066, "epoch": 0.7169042769857433, "frac_reward_zero_std": 0.0, "grad_norm": 6.4375, "learning_rate": 4.5221635336304825e-06, "loss": 0.0, "num_tokens": 5677051.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.040678431978449225, "epoch": 0.7175831636116768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.502342577192342e-06, "loss": 0.0, "num_tokens": 5681977.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 129.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.0555617306381464, "epoch": 0.7182620502376104, "frac_reward_zero_std": 0.0, "grad_norm": 5.625, "learning_rate": 4.482552526943432e-06, "loss": 0.0, "num_tokens": 5686280.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "entropy": 0.048259368166327477, "epoch": 0.7189409368635438, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "learning_rate": 4.4627934941375185e-06, "loss": 0.0, "num_tokens": 5694112.0, "reward": 2.107142925262451, "reward_std": 0.2503642439842224, "rewards/fixed_code_pass_all_test_reward/mean": 0.1071428582072258, "rewards/fixed_code_pass_all_test_reward/std": 0.25036418437957764, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 225.25, "completions/mean_terminated_length": 225.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.04348608711734414, "epoch": 0.7196198234894773, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "learning_rate": 4.443065589853977e-06, "loss": 0.0, "num_tokens": 5699426.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.026979003101587296, "epoch": 0.7202987101154107, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.423368924997208e-06, "loss": 0.0, "num_tokens": 5705233.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.01662625279277563, "epoch": 0.7209775967413442, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.403703610295972e-06, "loss": 0.0, "num_tokens": 5711544.0, "reward": 2.375, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.049123300705105066, "epoch": 0.7216564833672776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.3840697563028076e-06, "loss": 0.0, "num_tokens": 5715771.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 120.5, "completions/mean_terminated_length": 120.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06565538328140974, "epoch": 0.7223353699932111, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "learning_rate": 4.36446747339338e-06, "loss": 0.0, "num_tokens": 5719855.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 114.125, "completions/mean_terminated_length": 114.125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.055873916018754244, "epoch": 0.7230142566191446, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "learning_rate": 4.344896871765868e-06, "loss": 0.0, "num_tokens": 5724264.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.05432352190837264, "epoch": 0.723693143245078, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.325358061440356e-06, "loss": 0.0, "num_tokens": 5728976.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06329419417306781, "epoch": 0.7243720298710116, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "learning_rate": 4.30585115225821e-06, "loss": 0.0, "num_tokens": 5735285.0, "reward": 2.71875, "reward_std": 0.31160587072372437, "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, "rewards/fixed_code_pass_all_test_reward/std": 0.31160587072372437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 120.75, "completions/mean_terminated_length": 120.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.05682699754834175, "epoch": 0.725050916496945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2863762538814465e-06, "loss": 0.0, "num_tokens": 5739539.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 325.875, "completions/mean_terminated_length": 325.875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.03501487663015723, "epoch": 0.7257298031228785, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2669334757921284e-06, "loss": 0.0, "num_tokens": 5747170.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.02034762524999678, "epoch": 0.7264086897488119, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "learning_rate": 4.2475229272917565e-06, "loss": 0.0, "num_tokens": 5753648.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 267.625, "completions/mean_terminated_length": 267.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.06554784090258181, "epoch": 0.7270875763747454, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.228144717500642e-06, "loss": 0.0, "num_tokens": 5760725.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 113.25, "completions/mean_terminated_length": 113.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.02224722900427878, "epoch": 0.7277664630006789, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.208798955357295e-06, "loss": 0.0, "num_tokens": 5765263.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.1215749466791749, "epoch": 0.7284453496266123, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.189485749617813e-06, "loss": 0.0, "num_tokens": 5770041.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 167.125, "completions/mean_terminated_length": 167.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.0479441003408283, "epoch": 0.7291242362525459, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "learning_rate": 4.170205208855281e-06, "loss": 0.0, "num_tokens": 5774730.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.031790854409337044, "epoch": 0.7298031228784793, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.150957441459139e-06, "loss": 0.0, "num_tokens": 5779219.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.024800655664876103, "epoch": 0.7304820095044128, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "learning_rate": 4.131742555634597e-06, "loss": 0.0, "num_tokens": 5785099.0, "reward": 1.8928571939468384, "reward_std": 0.06612997502088547, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.06613000482320786, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.07442566379904747, "epoch": 0.7311608961303462, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "learning_rate": 4.112560659401999e-06, "loss": -0.0, "num_tokens": 5789410.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.05083320615813136, "epoch": 0.7318397827562797, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.093411860596253e-06, "loss": 0.0, "num_tokens": 5794159.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 241.0, "completions/mean_terminated_length": 241.0, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.02780906716361642, "epoch": 0.7325186693822132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.0742962668661826e-06, "loss": 0.0, "num_tokens": 5800383.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 281.125, "completions/mean_terminated_length": 281.125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.06524079642258584, "epoch": 0.7331975560081466, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "learning_rate": 4.055213985673949e-06, "loss": -0.0, "num_tokens": 5807816.0, "reward": 2.5357141494750977, "reward_std": 0.05399487912654877, "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.05399493873119354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.04657298885285854, "epoch": 0.7338764426340801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.036165124294445e-06, "loss": 0.0, "num_tokens": 5811993.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 174.5, "completions/mean_terminated_length": 174.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.13908050954341888, "epoch": 0.7345553292600135, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.017149789814689e-06, "loss": 0.0, "num_tokens": 5816717.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07652442436665297, "epoch": 0.7352342158859471, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.998168089133211e-06, "loss": 0.0, "num_tokens": 5821107.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 418.625, "completions/mean_terminated_length": 418.625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.033000505762174726, "epoch": 0.7359131025118805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.979220128959463e-06, "loss": 0.0, "num_tokens": 5830432.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 168.375, "completions/mean_terminated_length": 168.375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.07138945162296295, "epoch": 0.736591989137814, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "learning_rate": 3.960306015813228e-06, "loss": 0.0, "num_tokens": 5835131.0, "reward": 2.3125, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 181.75, "completions/mean_terminated_length": 181.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.03901159903034568, "epoch": 0.7372708757637475, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "learning_rate": 3.941425856024007e-06, "loss": 0.0, "num_tokens": 5839817.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.03441104665398598, "epoch": 0.7379497623896809, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.92257975573042e-06, "loss": 0.0, "num_tokens": 5846835.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.0771292713470757, "epoch": 0.7386286490156144, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "learning_rate": 3.9037678208796144e-06, "loss": 0.0, "num_tokens": 5851747.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.03328163595870137, "epoch": 0.7393075356415478, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "learning_rate": 3.884990157226683e-06, "loss": 0.0, "num_tokens": 5858155.0, "reward": 2.7083334922790527, "reward_std": 0.2920914888381958, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.2920915186405182, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.06203055148944259, "epoch": 0.7399864222674813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.866246870334036e-06, "loss": 0.0, "num_tokens": 5863068.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 112.125, "completions/mean_terminated_length": 112.125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.03161480696871877, "epoch": 0.7406653088934148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.847538065570847e-06, "loss": 0.0, "num_tokens": 5867781.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 104.625, "completions/mean_terminated_length": 104.625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.04372584540396929, "epoch": 0.7413441955193483, "frac_reward_zero_std": 0.0, "grad_norm": 6.96875, "learning_rate": 3.828863848112425e-06, "loss": -0.0, "num_tokens": 5871706.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 200.375, "completions/mean_terminated_length": 200.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.03522487124428153, "epoch": 0.7420230821452818, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.810224322939655e-06, "loss": 0.0, "num_tokens": 5877237.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.041397218592464924, "epoch": 0.7427019687712152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.7916195948383817e-06, "loss": 0.0, "num_tokens": 5881489.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 130.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.06950572319328785, "epoch": 0.7433808553971487, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.7730497683988287e-06, "loss": 0.0, "num_tokens": 5885801.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.04397960612550378, "epoch": 0.7440597420230821, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.7545149480150224e-06, "loss": 0.0, "num_tokens": 5889909.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07751855719834566, "epoch": 0.7447386286490156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.736015237884193e-06, "loss": 0.0, "num_tokens": 5894249.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.11444954946637154, "epoch": 0.745417515274949, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "learning_rate": 3.7175507420061885e-06, "loss": -0.0, "num_tokens": 5899944.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 367.375, "completions/mean_terminated_length": 367.375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "entropy": 0.05010857083834708, "epoch": 0.7460964019008826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6991215641828903e-06, "loss": 0.0, "num_tokens": 5908027.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.0829274570569396, "epoch": 0.7467752885268161, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.680727808017638e-06, "loss": 0.0, "num_tokens": 5913347.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.04712094506248832, "epoch": 0.7474541751527495, "frac_reward_zero_std": 0.0, "grad_norm": 4.0, "learning_rate": 3.662369576914642e-06, "loss": 0.0, "num_tokens": 5917617.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 196.5, "completions/mean_terminated_length": 196.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.025217097951099277, "epoch": 0.748133061778683, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.644046974078397e-06, "loss": 0.0, "num_tokens": 5923813.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.08944710437208414, "epoch": 0.7488119484046164, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "learning_rate": 3.625760102513103e-06, "loss": -0.0, "num_tokens": 5927948.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.055567214265465736, "epoch": 0.7494908350305499, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "learning_rate": 3.607509065022101e-06, "loss": 0.0, "num_tokens": 5932226.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.06994170090183616, "epoch": 0.7501697216564833, "frac_reward_zero_std": 0.0, "grad_norm": 3.359375, "learning_rate": 3.589293964207271e-06, "loss": -0.0, "num_tokens": 5937593.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.05150497052818537, "epoch": 0.7508486082824168, "frac_reward_zero_std": 0.0, "grad_norm": 4.03125, "learning_rate": 3.57111490246848e-06, "loss": 0.0, "num_tokens": 5942053.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 302.125, "completions/mean_terminated_length": 302.125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.035789793357253075, "epoch": 0.7515274949083504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5529719820029785e-06, "loss": 0.0, "num_tokens": 5949006.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 197.5, "completions/mean_terminated_length": 197.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.055672927759587765, "epoch": 0.7522063815342838, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "learning_rate": 3.5348653048048598e-06, "loss": 0.0, "num_tokens": 5954418.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 191.375, "completions/mean_terminated_length": 191.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.07923690136522055, "epoch": 0.7528852681602173, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "learning_rate": 3.5167949726644545e-06, "loss": 0.0, "num_tokens": 5959053.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.03581171575933695, "epoch": 0.7535641547861507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4987610871677746e-06, "loss": 0.0, "num_tokens": 5963654.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 119.625, "completions/mean_terminated_length": 119.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.03456205199472606, "epoch": 0.7542430414120842, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "learning_rate": 3.4807637496959433e-06, "loss": -0.0, "num_tokens": 5968331.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.05524680111557245, "epoch": 0.7549219280380176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4628030614246266e-06, "loss": 0.0, "num_tokens": 5972709.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 185.375, "completions/mean_terminated_length": 185.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.056924451142549515, "epoch": 0.7556008146639511, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4448791233234467e-06, "loss": 0.0, "num_tokens": 5977312.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 181.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.05375195760279894, "epoch": 0.7562797012898846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4269920361554342e-06, "loss": 0.0, "num_tokens": 5982114.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 320.625, "completions/mean_terminated_length": 320.625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.016106918221339583, "epoch": 0.756958587915818, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "learning_rate": 3.409141900476457e-06, "loss": -0.0, "num_tokens": 5991655.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 256.375, "completions/mean_terminated_length": 256.375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.01662868820130825, "epoch": 0.7576374745417516, "frac_reward_zero_std": 0.0, "grad_norm": 0.9375, "learning_rate": 3.3913288166346525e-06, "loss": 0.0, "num_tokens": 5998130.0, "reward": 2.953125, "reward_std": 0.13258251547813416, "rewards/fixed_code_pass_all_test_reward/mean": 0.953125, "rewards/fixed_code_pass_all_test_reward/std": 0.13258251547813416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.07061495538800955, "epoch": 0.758316361167685, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3735528847698597e-06, "loss": 0.0, "num_tokens": 6002595.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 117.125, "completions/mean_terminated_length": 117.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07740376610308886, "epoch": 0.7589952477936185, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "learning_rate": 3.355814204813058e-06, "loss": -0.0, "num_tokens": 6007004.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.07518411288037896, "epoch": 0.7596741344195519, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "learning_rate": 3.338112876485821e-06, "loss": 0.0, "num_tokens": 6012585.0, "reward": 2.875, "reward_std": 0.10350989550352097, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.1035098284482956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.17964396253228188, "epoch": 0.7603530210454854, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3204489992997226e-06, "loss": 0.0, "num_tokens": 6017628.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.049109778832644224, "epoch": 0.7610319076714189, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.302822672555819e-06, "loss": 0.0, "num_tokens": 6022473.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06834228057414293, "epoch": 0.7617107942973523, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "learning_rate": 3.285233995344049e-06, "loss": -0.0, "num_tokens": 6026639.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.04049702361226082, "epoch": 0.7623896809232859, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.267683066542715e-06, "loss": 0.0, "num_tokens": 6031193.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.03817987139336765, "epoch": 0.7630685675492193, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.250169984817897e-06, "loss": 0.0, "num_tokens": 6035989.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 229.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.04785697255283594, "epoch": 0.7637474541751528, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "learning_rate": 3.2326948486229105e-06, "loss": 0.0, "num_tokens": 6041808.0, "reward": 2.7750000953674316, "reward_std": 0.4200340211391449, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.4200340509414673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 149.375, "completions/mean_terminated_length": 149.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.03661369648762047, "epoch": 0.7644263408010862, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "learning_rate": 3.215257756197758e-06, "loss": 0.0, "num_tokens": 6046563.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.06413457496091723, "epoch": 0.7651052274270197, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "learning_rate": 3.1978588055685733e-06, "loss": -0.0, "num_tokens": 6051155.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 255.75, "completions/mean_terminated_length": 255.75, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.10283747594803572, "epoch": 0.7657841140529531, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.18049809454706e-06, "loss": 0.0, "num_tokens": 6056937.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.1147898193448782, "epoch": 0.7664630006788866, "frac_reward_zero_std": 0.0, "grad_norm": 3.90625, "learning_rate": 3.163175720729954e-06, "loss": 0.0, "num_tokens": 6061794.0, "reward": 2.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 170.875, "completions/mean_terminated_length": 170.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.0754433018155396, "epoch": 0.7671418873048201, "frac_reward_zero_std": 0.0, "grad_norm": 3.65625, "learning_rate": 3.1458917814984657e-06, "loss": 0.0, "num_tokens": 6066681.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 106.0, "completions/mean_terminated_length": 106.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.04454766772687435, "epoch": 0.7678207739307535, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "learning_rate": 3.128646374017754e-06, "loss": 0.0, "num_tokens": 6070777.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 339.875, "completions/mean_terminated_length": 339.875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "entropy": 0.029199098702520132, "epoch": 0.7684996605566871, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "learning_rate": 3.1114395952363486e-06, "loss": 0.0, "num_tokens": 6078336.0, "reward": 2.625, "reward_std": 0.38917219638824463, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.25717225670814514, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.125, "completions/mean_terminated_length": 123.125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.10223288089036942, "epoch": 0.7691785471826205, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0942715418856184e-06, "loss": 0.0, "num_tokens": 6082425.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 268.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.033164044842123985, "epoch": 0.769857433808554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0771423104792454e-06, "loss": 0.0, "num_tokens": 6089073.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 118.125, "completions/mean_terminated_length": 118.125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.0335758535657078, "epoch": 0.7705363204344874, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "learning_rate": 3.060051997312646e-06, "loss": 0.0, "num_tokens": 6093226.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.03158295247703791, "epoch": 0.7712152070604209, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "learning_rate": 3.0430006984624704e-06, "loss": -0.0, "num_tokens": 6099881.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 131.75, "completions/mean_terminated_length": 131.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.05773442704230547, "epoch": 0.7718940936863544, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "learning_rate": 3.025988509786023e-06, "loss": 0.0, "num_tokens": 6104207.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 122.125, "completions/mean_terminated_length": 122.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.06339313322678208, "epoch": 0.7725729803122878, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0090155269207575e-06, "loss": 0.0, "num_tokens": 6108376.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 79.25, "completions/mean_terminated_length": 79.25, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.015622854698449373, "epoch": 0.7732518669382213, "frac_reward_zero_std": 0.0, "grad_norm": 6.40625, "learning_rate": 2.992081845283715e-06, "loss": 0.0, "num_tokens": 6112354.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 135.125, "completions/mean_terminated_length": 135.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.020465551177039742, "epoch": 0.7739307535641547, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.975187560070998e-06, "loss": 0.0, "num_tokens": 6116963.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 165.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.0760266799479723, "epoch": 0.7746096401900883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.958332766257237e-06, "loss": 0.0, "num_tokens": 6121573.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.05395929981023073, "epoch": 0.7752885268160217, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.941517558595056e-06, "loss": 0.0, "num_tokens": 6126011.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 290.375, "completions/mean_terminated_length": 290.375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.040367637760937214, "epoch": 0.7759674134419552, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "learning_rate": 2.9247420316145324e-06, "loss": 0.0, "num_tokens": 6132990.0, "reward": 1.975000023841858, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 321.5, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.04299045028164983, "epoch": 0.7766463000678887, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.908006279622667e-06, "loss": 0.0, "num_tokens": 6140258.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.08779045380651951, "epoch": 0.7773251866938221, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "learning_rate": 2.8913103967028664e-06, "loss": 0.0, "num_tokens": 6144358.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 204.375, "completions/mean_terminated_length": 204.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.027459141332656145, "epoch": 0.7780040733197556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8746544767144056e-06, "loss": 0.0, "num_tokens": 6150289.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.04449179023504257, "epoch": 0.778682959945689, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8580386132918916e-06, "loss": 0.0, "num_tokens": 6155850.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.06664275424554944, "epoch": 0.7793618465716226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.841462899844749e-06, "loss": 0.0, "num_tokens": 6160799.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.0406491719186306, "epoch": 0.780040733197556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8249274295566863e-06, "loss": 0.0, "num_tokens": 6167507.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 299.625, "completions/mean_terminated_length": 299.625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.05521875433623791, "epoch": 0.7807196198234895, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8084322953851963e-06, "loss": 0.0, "num_tokens": 6174848.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.07622930407524109, "epoch": 0.781398506449423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.79197759006099e-06, "loss": 0.0, "num_tokens": 6179548.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.08072125958278775, "epoch": 0.7820773930753564, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "learning_rate": 2.7755634060875135e-06, "loss": -0.0, "num_tokens": 6183798.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 190.625, "completions/mean_terminated_length": 190.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.09614286851137877, "epoch": 0.7827562797012899, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "learning_rate": 2.7591898357404066e-06, "loss": -0.0, "num_tokens": 6189283.0, "reward": 2.9479165077209473, "reward_std": 0.14731398224830627, "rewards/fixed_code_pass_all_test_reward/mean": 0.9479166269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.1473139226436615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.06191416038200259, "epoch": 0.7834351663272233, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "learning_rate": 2.742856971066996e-06, "loss": -0.0, "num_tokens": 6194438.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.04340082639828324, "epoch": 0.7841140529531568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7265649038857776e-06, "loss": 0.0, "num_tokens": 6199062.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 80.875, "completions/mean_terminated_length": 80.875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.05926245450973511, "epoch": 0.7847929395790902, "frac_reward_zero_std": 0.0, "grad_norm": 6.125, "learning_rate": 2.7103137257858867e-06, "loss": -0.0, "num_tokens": 6202733.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 109.25, "completions/mean_terminated_length": 109.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.04237109562382102, "epoch": 0.7854718262050238, "frac_reward_zero_std": 0.0, "grad_norm": 4.21875, "learning_rate": 2.6941035281265936e-06, "loss": -0.0, "num_tokens": 6207007.0, "reward": 2.8499999046325684, "reward_std": 0.2777459919452667, "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2777460217475891, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 94.875, "completions/mean_terminated_length": 94.875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.04313861997798085, "epoch": 0.7861507128309573, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.677934402036797e-06, "loss": 0.0, "num_tokens": 6211062.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 109.875, "completions/mean_terminated_length": 109.875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.03325886162929237, "epoch": 0.7868295994568907, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6618064384144925e-06, "loss": 0.0, "num_tokens": 6215757.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.027541066287085414, "epoch": 0.7875084860828242, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6457197279262835e-06, "loss": 0.0, "num_tokens": 6221766.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 135.75, "completions/mean_terminated_length": 135.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.043505198787897825, "epoch": 0.7881873727087576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.629674361006851e-06, "loss": 0.0, "num_tokens": 6225972.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 183.75, "completions/mean_terminated_length": 183.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.07034529093652964, "epoch": 0.7888662593346911, "frac_reward_zero_std": 0.0, "grad_norm": 2.953125, "learning_rate": 2.6136704278584624e-06, "loss": 0.0, "num_tokens": 6230802.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 115.5, "completions/mean_terminated_length": 115.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.05324462801218033, "epoch": 0.7895451459606245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.597708018450453e-06, "loss": 0.0, "num_tokens": 6235102.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 122.0, "completions/mean_terminated_length": 122.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.038038093596696854, "epoch": 0.790224032586558, "frac_reward_zero_std": 0.0, "grad_norm": 3.609375, "learning_rate": 2.58178722251872e-06, "loss": 0.0, "num_tokens": 6239670.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 162.5, "completions/mean_terminated_length": 162.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.0353456602897495, "epoch": 0.7909029192124916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5659081295652298e-06, "loss": 0.0, "num_tokens": 6244194.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 200.625, "completions/mean_terminated_length": 200.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06646355940029025, "epoch": 0.791581805838425, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "learning_rate": 2.550070828857506e-06, "loss": -0.0, "num_tokens": 6249279.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 118.5, "completions/mean_terminated_length": 118.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.0543752028606832, "epoch": 0.7922606924643585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5342754094281253e-06, "loss": 0.0, "num_tokens": 6253499.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 229.875, "completions/mean_terminated_length": 229.875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.05022946512326598, "epoch": 0.7929395790902919, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "learning_rate": 2.518521960074217e-06, "loss": 0.0, "num_tokens": 6259562.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 171.875, "completions/mean_terminated_length": 171.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.043414485175162554, "epoch": 0.7936184657162254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.502810569356976e-06, "loss": 0.0, "num_tokens": 6264297.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.05755961430259049, "epoch": 0.7942973523421588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4871413256011534e-06, "loss": 0.0, "num_tokens": 6269549.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.04200444184243679, "epoch": 0.7949762389680923, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "learning_rate": 2.471514316894559e-06, "loss": -0.0, "num_tokens": 6275292.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 132.25, "completions/mean_terminated_length": 132.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.021738025126978755, "epoch": 0.7956551255940258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.455929631087568e-06, "loss": 0.0, "num_tokens": 6279734.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 135.875, "completions/mean_terminated_length": 135.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.033521171659231186, "epoch": 0.7963340122199593, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "learning_rate": 2.440387355792638e-06, "loss": 0.0, "num_tokens": 6284221.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.07864878699183464, "epoch": 0.7970128988458928, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "learning_rate": 2.424887578383799e-06, "loss": -0.0, "num_tokens": 6289326.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 330.5, "completions/mean_terminated_length": 330.5, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "entropy": 0.025022073416039348, "epoch": 0.7976917854718262, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "learning_rate": 2.4094303859961774e-06, "loss": 0.0, "num_tokens": 6296914.0, "reward": 2.8125, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 204.125, "completions/mean_terminated_length": 204.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.020753496093675494, "epoch": 0.7983706720977597, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.394015865525491e-06, "loss": 0.0, "num_tokens": 6302883.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 98.0, "completions/mean_terminated_length": 98.0, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.035069910110905766, "epoch": 0.7990495587236931, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3786441036275764e-06, "loss": 0.0, "num_tokens": 6306899.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.07078630151227117, "epoch": 0.7997284453496266, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3633151867178915e-06, "loss": 0.0, "num_tokens": 6311303.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.09753914177417755, "epoch": 0.8004073319755601, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "learning_rate": 2.3480292009710282e-06, "loss": -0.0, "num_tokens": 6317148.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 208.375, "completions/mean_terminated_length": 208.375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.01735112862661481, "epoch": 0.8010862186014935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3327862323202377e-06, "loss": 0.0, "num_tokens": 6323319.0, "reward": 2.375, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.08102944400161505, "epoch": 0.801765105227427, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "learning_rate": 2.3175863664569454e-06, "loss": 0.0, "num_tokens": 6327775.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 103.5, "completions/mean_terminated_length": 103.5, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.058857130352407694, "epoch": 0.8024439918533605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3024296888302565e-06, "loss": 0.0, "num_tokens": 6331731.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.03190521849319339, "epoch": 0.803122878479294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2873162846464868e-06, "loss": 0.0, "num_tokens": 6336996.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 177.875, "completions/mean_terminated_length": 177.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.11243955511599779, "epoch": 0.8038017651052274, "frac_reward_zero_std": 0.0, "grad_norm": 3.296875, "learning_rate": 2.272246238868687e-06, "loss": -0.0, "num_tokens": 6342011.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 311.25, "completions/mean_terminated_length": 311.25, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.04063680907711387, "epoch": 0.8044806517311609, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2572196362161592e-06, "loss": 0.0, "num_tokens": 6349269.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07085582660511136, "epoch": 0.8051595383570944, "frac_reward_zero_std": 0.0, "grad_norm": 4.1875, "learning_rate": 2.242236561163976e-06, "loss": -0.0, "num_tokens": 6353635.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.0492614540271461, "epoch": 0.8058384249830278, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "learning_rate": 2.227297097942511e-06, "loss": -0.0, "num_tokens": 6359085.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 107.75, "completions/mean_terminated_length": 107.75, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.030527201481163502, "epoch": 0.8065173116089613, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.212401330536973e-06, "loss": 0.0, "num_tokens": 6363091.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 256.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.03725535562261939, "epoch": 0.8071961982348947, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "learning_rate": 2.1975493426869155e-06, "loss": 0.0, "num_tokens": 6369549.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 264.5, "completions/mean_terminated_length": 264.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.03545244783163071, "epoch": 0.8078750848608283, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "learning_rate": 2.1827412178857866e-06, "loss": 0.0, "num_tokens": 6376537.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 197.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.07140312995761633, "epoch": 0.8085539714867617, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "learning_rate": 2.167977039380439e-06, "loss": 0.0, "num_tokens": 6381464.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.0885469326749444, "epoch": 0.8092328581126952, "frac_reward_zero_std": 0.0, "grad_norm": 4.03125, "learning_rate": 2.153256890170683e-06, "loss": 0.0, "num_tokens": 6385690.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.05450877780094743, "epoch": 0.8099117447386287, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "learning_rate": 2.1385808530088024e-06, "loss": 0.0, "num_tokens": 6390402.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.07823984464630485, "epoch": 0.8105906313645621, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "learning_rate": 2.1239490103990946e-06, "loss": -0.0, "num_tokens": 6395244.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.03320121788419783, "epoch": 0.8112695179904956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.109361444597414e-06, "loss": 0.0, "num_tokens": 6399733.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.06206024717539549, "epoch": 0.811948404616429, "frac_reward_zero_std": 0.0, "grad_norm": 3.71875, "learning_rate": 2.0948182376107063e-06, "loss": 0.0, "num_tokens": 6404313.0, "reward": 2.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.054358861874789, "epoch": 0.8126272912423625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0803194711965356e-06, "loss": 0.0, "num_tokens": 6409145.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 121.0, "completions/mean_terminated_length": 121.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.057942730374634266, "epoch": 0.813306177868296, "frac_reward_zero_std": 0.0, "grad_norm": 5.15625, "learning_rate": 2.0658652268626402e-06, "loss": 0.0, "num_tokens": 6413425.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.05780556984245777, "epoch": 0.8139850644942295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0514555858664663e-06, "loss": 0.0, "num_tokens": 6417621.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 155.25, "completions/mean_terminated_length": 155.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.05955437617376447, "epoch": 0.814663951120163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.037090629214721e-06, "loss": 0.0, "num_tokens": 6422271.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 108.875, "completions/mean_terminated_length": 108.875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.06297881994396448, "epoch": 0.8153428377460964, "frac_reward_zero_std": 0.0, "grad_norm": 3.4375, "learning_rate": 2.0227704376628987e-06, "loss": -0.0, "num_tokens": 6426270.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.05374971451237798, "epoch": 0.8160217243720299, "frac_reward_zero_std": 0.0, "grad_norm": 5.0, "learning_rate": 2.0084950917148403e-06, "loss": 0.0, "num_tokens": 6430684.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.10325939115136862, "epoch": 0.8167006109979633, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "learning_rate": 1.9942646716222867e-06, "loss": -0.0, "num_tokens": 6435323.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.051525934133678675, "epoch": 0.8173794976238968, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "learning_rate": 1.980079257384405e-06, "loss": 0.0, "num_tokens": 6440224.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.07622804772108793, "epoch": 0.8180583842498302, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9659389287473675e-06, "loss": 0.0, "num_tokens": 6444860.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.0970824770629406, "epoch": 0.8187372708757638, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "learning_rate": 1.9518437652038757e-06, "loss": 0.0, "num_tokens": 6451179.0, "reward": 1.4375, "reward_std": 0.6196196675300598, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.3133915960788727, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.056854731403291225, "epoch": 0.8194161575016972, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.937793845992737e-06, "loss": 0.0, "num_tokens": 6455634.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.030172571539878845, "epoch": 0.8200950441276307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9237892500984022e-06, "loss": 0.0, "num_tokens": 6461823.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 101.5, "completions/mean_terminated_length": 101.5, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.041986153926700354, "epoch": 0.8207739307535642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9098300562505266e-06, "loss": 0.0, "num_tokens": 6465747.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 199.875, "completions/mean_terminated_length": 199.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.052354331593960524, "epoch": 0.8214528173794976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.895916342923534e-06, "loss": 0.0, "num_tokens": 6471058.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.055486186407506466, "epoch": 0.8221317040054311, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.882048188336172e-06, "loss": 0.0, "num_tokens": 6475308.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 179.25, "completions/mean_terminated_length": 179.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.0853898017667234, "epoch": 0.8228105906313645, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "learning_rate": 1.8682256704510625e-06, "loss": 0.0, "num_tokens": 6480182.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.05451425025239587, "epoch": 0.823489477257298, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "learning_rate": 1.8544488669742755e-06, "loss": 0.0, "num_tokens": 6485453.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 280.375, "completions/mean_terminated_length": 280.375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.0478174164891243, "epoch": 0.8241683638832314, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "learning_rate": 1.8407178553548876e-06, "loss": 0.0, "num_tokens": 6493336.0, "reward": 2.625, "reward_std": 0.43404853343963623, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.039811473339796066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.02480868436396122, "epoch": 0.824847250509165, "frac_reward_zero_std": 0.0, "grad_norm": 3.828125, "learning_rate": 1.8270327127845534e-06, "loss": -0.0, "num_tokens": 6499508.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.05973181268200278, "epoch": 0.8255261371350985, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8133935161970561e-06, "loss": 0.0, "num_tokens": 6503732.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 119.625, "completions/mean_terminated_length": 119.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.05151199037209153, "epoch": 0.8262050237610319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7998003422678867e-06, "loss": 0.0, "num_tokens": 6508017.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 181.875, "completions/mean_terminated_length": 181.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06716850865632296, "epoch": 0.8268839103869654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7862532674138166e-06, "loss": 0.0, "num_tokens": 6512704.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.02405495452694595, "epoch": 0.8275627970128988, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.772752367792452e-06, "loss": 0.0, "num_tokens": 6517136.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.03884179727174342, "epoch": 0.8282416836388323, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "learning_rate": 1.7592977193018268e-06, "loss": -0.0, "num_tokens": 6522502.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 212.125, "completions/mean_terminated_length": 212.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.059142252430319786, "epoch": 0.8289205702647657, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "learning_rate": 1.745889397579954e-06, "loss": 0.0, "num_tokens": 6527423.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.08818131405860186, "epoch": 0.8295994568906992, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "learning_rate": 1.732527478004422e-06, "loss": 0.0, "num_tokens": 6531551.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.030679827090352774, "epoch": 0.8302783435166328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7192120356919517e-06, "loss": 0.0, "num_tokens": 6537632.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 351.25, "completions/mean_terminated_length": 351.25, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "entropy": 0.04530319105833769, "epoch": 0.8309572301425662, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7059431454979825e-06, "loss": 0.0, "num_tokens": 6546074.0, "reward": 2.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 165.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.07944596000015736, "epoch": 0.8316361167684997, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6927208820162589e-06, "loss": 0.0, "num_tokens": 6550684.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.11907055135816336, "epoch": 0.8323150033944331, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "learning_rate": 1.6795453195784017e-06, "loss": 0.0, "num_tokens": 6556560.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 157.375, "completions/mean_terminated_length": 157.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06162241520360112, "epoch": 0.8329938900203666, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "learning_rate": 1.6664165322534887e-06, "loss": -0.0, "num_tokens": 6561035.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 97.625, "completions/mean_terminated_length": 97.625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.04993098718114197, "epoch": 0.8336727766463, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.653334593847643e-06, "loss": 0.0, "num_tokens": 6564856.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 131.75, "completions/mean_terminated_length": 131.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.04275673255324364, "epoch": 0.8343516632722335, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "learning_rate": 1.6402995779036146e-06, "loss": -0.0, "num_tokens": 6569446.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 364.25, "completions/mean_terminated_length": 364.25, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "entropy": 0.021586764021776617, "epoch": 0.835030549898167, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "learning_rate": 1.6273115577003806e-06, "loss": 0.0, "num_tokens": 6577832.0, "reward": 2.075000047683716, "reward_std": 0.384522020816803, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.0508713866584003, "epoch": 0.8357094365241005, "frac_reward_zero_std": 0.0, "grad_norm": 4.34375, "learning_rate": 1.6143706062527108e-06, "loss": 0.0, "num_tokens": 6583372.0, "reward": 2.0, "reward_std": 0.8141603469848633, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.0736389528028667, "epoch": 0.836388323150034, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "learning_rate": 1.6014767963107715e-06, "loss": -0.0, "num_tokens": 6587711.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 142.0, "completions/mean_terminated_length": 142.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.06715327175334096, "epoch": 0.8370672097759674, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "learning_rate": 1.588630200359711e-06, "loss": 0.0, "num_tokens": 6592847.0, "reward": 1.875, "reward_std": 0.06681530922651291, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.058211774099618196, "epoch": 0.8377460964019009, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "learning_rate": 1.575830890619261e-06, "loss": -0.0, "num_tokens": 6597239.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.047470828518271446, "epoch": 0.8384249830278343, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.563078939043322e-06, "loss": 0.0, "num_tokens": 6601780.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 297.875, "completions/mean_terminated_length": 297.875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.050020713126286864, "epoch": 0.8391038696537678, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5503744173195568e-06, "loss": 0.0, "num_tokens": 6608883.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.09777758829295635, "epoch": 0.8397827562797013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5377173968689985e-06, "loss": 0.0, "num_tokens": 6613596.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 179.875, "completions/mean_terminated_length": 179.875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.0899508735165, "epoch": 0.8404616429056347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5251079488456367e-06, "loss": 0.0, "num_tokens": 6618491.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 168.375, "completions/mean_terminated_length": 168.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.07908450113609433, "epoch": 0.8411405295315683, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "learning_rate": 1.5125461441360223e-06, "loss": 0.0, "num_tokens": 6623294.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 141.625, "completions/mean_terminated_length": 141.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07393840188160539, "epoch": 0.8418194161575017, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.500032053358874e-06, "loss": 0.0, "num_tokens": 6627603.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.03621608763933182, "epoch": 0.8424983027834352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4875657468646788e-06, "loss": 0.0, "num_tokens": 6633832.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 140.875, "completions/mean_terminated_length": 140.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.036038106540217996, "epoch": 0.8431771894093686, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4751472947352884e-06, "loss": 0.0, "num_tokens": 6638223.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 353.125, "completions/mean_terminated_length": 353.125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.047747176606208086, "epoch": 0.8438560760353021, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "learning_rate": 1.4627767667835336e-06, "loss": -0.0, "num_tokens": 6646968.0, "reward": 2.7142856121063232, "reward_std": 0.3581618070602417, "rewards/fixed_code_pass_all_test_reward/mean": 0.8392857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.22180677950382233, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.04761378560215235, "epoch": 0.8445349626612356, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.450454232552826e-06, "loss": 0.0, "num_tokens": 6651836.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.022800437407568097, "epoch": 0.845213849287169, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4381797613167859e-06, "loss": 0.0, "num_tokens": 6656619.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.05551019869744778, "epoch": 0.8458927359131025, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "learning_rate": 1.4259534220788207e-06, "loss": 0.0, "num_tokens": 6661183.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 132.375, "completions/mean_terminated_length": 132.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07395130163058639, "epoch": 0.846571622539036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4137752835717622e-06, "loss": 0.0, "num_tokens": 6665602.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 91.875, "completions/mean_terminated_length": 91.875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.04875294351950288, "epoch": 0.8472505091649695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4016454142574676e-06, "loss": 0.0, "num_tokens": 6669561.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 319.5, "completions/mean_terminated_length": 319.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.027896488085389137, "epoch": 0.8479293957909029, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "learning_rate": 1.3895638823264447e-06, "loss": 0.0, "num_tokens": 6676957.0, "reward": 1.8055555820465088, "reward_std": 0.3293411433696747, "rewards/fixed_code_pass_all_test_reward/mean": 0.6805555820465088, "rewards/fixed_code_pass_all_test_reward/std": 0.28752732276916504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 177.875, "completions/mean_terminated_length": 177.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.03768895659595728, "epoch": 0.8486082824168364, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "learning_rate": 1.3775307556974616e-06, "loss": -0.0, "num_tokens": 6681804.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 268.875, "completions/mean_terminated_length": 268.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.031340275425463915, "epoch": 0.8492871690427699, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3655461020171635e-06, "loss": 0.0, "num_tokens": 6688331.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.04886147053912282, "epoch": 0.8499660556687033, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3536099886596933e-06, "loss": 0.0, "num_tokens": 6693146.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 197.0, "completions/mean_terminated_length": 197.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.052328506018966436, "epoch": 0.8506449422946368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3417224827263232e-06, "loss": 0.0, "num_tokens": 6698690.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 138.625, "completions/mean_terminated_length": 138.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06357053900137544, "epoch": 0.8513238289205702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3298836510450597e-06, "loss": 0.0, "num_tokens": 6703055.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 118.125, "completions/mean_terminated_length": 118.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.053554283920675516, "epoch": 0.8520027155465038, "frac_reward_zero_std": 0.0, "grad_norm": 3.078125, "learning_rate": 1.3180935601702838e-06, "loss": -0.0, "num_tokens": 6707240.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.047173636965453625, "epoch": 0.8526816021724372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3063522763823655e-06, "loss": 0.0, "num_tokens": 6713002.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 147.0, "completions/mean_terminated_length": 147.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.09581214655190706, "epoch": 0.8533604887983707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2946598656873e-06, "loss": 0.0, "num_tokens": 6717530.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 126.75, "completions/mean_terminated_length": 126.75, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.018327789497561753, "epoch": 0.8540393754243042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2830163938163298e-06, "loss": 0.0, "num_tokens": 6722056.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 113.5, "completions/mean_terminated_length": 113.5, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.03673034347593784, "epoch": 0.8547182620502376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2714219262255777e-06, "loss": 0.0, "num_tokens": 6726796.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 243.5, "completions/mean_terminated_length": 243.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.10153366811573505, "epoch": 0.8553971486761711, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2598765280956793e-06, "loss": 0.0, "num_tokens": 6732488.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.09755927696824074, "epoch": 0.8560760353021045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2483802643314224e-06, "loss": 0.0, "num_tokens": 6737282.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 155.0, "completions/mean_terminated_length": 155.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06965050008147955, "epoch": 0.856754921928038, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "learning_rate": 1.2369331995613664e-06, "loss": 0.0, "num_tokens": 6741754.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 192.625, "completions/mean_terminated_length": 192.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.04192041279748082, "epoch": 0.8574338085539714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2255353981374906e-06, "loss": 0.0, "num_tokens": 6747199.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.09490544628351927, "epoch": 0.858112695179905, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "learning_rate": 1.214186924134838e-06, "loss": -0.0, "num_tokens": 6751296.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 192.375, "completions/mean_terminated_length": 192.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.04287162306718528, "epoch": 0.8587915818058385, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2028878413511413e-06, "loss": 0.0, "num_tokens": 6756851.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 290.5, "completions/mean_terminated_length": 290.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.03735964884981513, "epoch": 0.8594704684317719, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1916382133064707e-06, "loss": 0.0, "num_tokens": 6764007.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.061605677008628845, "epoch": 0.8601493550577054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.180438103242877e-06, "loss": 0.0, "num_tokens": 6768343.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.07917618751525879, "epoch": 0.8608282416836388, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1692875741240384e-06, "loss": 0.0, "num_tokens": 6773296.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 198.125, "completions/mean_terminated_length": 198.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.07407456485088915, "epoch": 0.8615071283095723, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "learning_rate": 1.158186688634898e-06, "loss": 0.0, "num_tokens": 6778153.0, "reward": 2.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.0826310757547617, "epoch": 0.8621860149355057, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "learning_rate": 1.1471355091813251e-06, "loss": 0.0, "num_tokens": 6782714.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.04163091070950031, "epoch": 0.8628649015614392, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "learning_rate": 1.1361340978897483e-06, "loss": -0.0, "num_tokens": 6787648.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 310.875, "completions/mean_terminated_length": 310.875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.02888157172128558, "epoch": 0.8635437881873728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.125182516606823e-06, "loss": 0.0, "num_tokens": 6794759.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.03162820101715624, "epoch": 0.8642226748133062, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1142808268990691e-06, "loss": 0.0, "num_tokens": 6800029.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.05688911909237504, "epoch": 0.8649015614392397, "frac_reward_zero_std": 0.0, "grad_norm": 6.8125, "learning_rate": 1.1034290900525279e-06, "loss": -0.0, "num_tokens": 6804680.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 98.75, "completions/mean_terminated_length": 98.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.044836345594376326, "epoch": 0.8655804480651731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0926273670724296e-06, "loss": 0.0, "num_tokens": 6808614.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.07390343258157372, "epoch": 0.8662593346911066, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "learning_rate": 1.0818757186828388e-06, "loss": 0.0, "num_tokens": 6813712.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 181.5, "completions/mean_terminated_length": 181.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.048651386983692646, "epoch": 0.86693822131704, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "learning_rate": 1.0711742053263107e-06, "loss": 0.0, "num_tokens": 6818452.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 414.5, "completions/mean_terminated_length": 414.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.01869669440202415, "epoch": 0.8676171079429735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0605228871635586e-06, "loss": 0.0, "num_tokens": 6827392.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 277.5, "completions/mean_terminated_length": 277.5, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.03080414840951562, "epoch": 0.868295994568907, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "learning_rate": 1.0499218240731157e-06, "loss": -0.0, "num_tokens": 6834132.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07403090968728065, "epoch": 0.8689748811948405, "frac_reward_zero_std": 0.0, "grad_norm": 3.96875, "learning_rate": 1.039371075650998e-06, "loss": -0.0, "num_tokens": 6838425.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 122.5, "completions/mean_terminated_length": 122.5, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.05667120357975364, "epoch": 0.869653767820774, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0288707012103593e-06, "loss": 0.0, "num_tokens": 6842677.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.07893756218254566, "epoch": 0.8703326544467074, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "learning_rate": 1.0184207597811724e-06, "loss": -0.0, "num_tokens": 6847298.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 129.625, "completions/mean_terminated_length": 129.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.036208047065883875, "epoch": 0.8710115410726409, "frac_reward_zero_std": 0.0, "grad_norm": 3.9375, "learning_rate": 1.0080213101098891e-06, "loss": -0.0, "num_tokens": 6851495.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.018495872616767883, "epoch": 0.8716904276985743, "frac_reward_zero_std": 0.0, "grad_norm": 5.09375, "learning_rate": 9.976724106591128e-07, "loss": 0.0, "num_tokens": 6855003.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 311.5, "completions/mean_terminated_length": 311.5, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.04359985748305917, "epoch": 0.8723693143245078, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "learning_rate": 9.873741196072683e-07, "loss": -0.0, "num_tokens": 6862047.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.04085562261752784, "epoch": 0.8730482009504412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.771264948482695e-07, "loss": 0.0, "num_tokens": 6868785.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 178.375, "completions/mean_terminated_length": 178.375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.07690457534044981, "epoch": 0.8737270875763747, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "learning_rate": 9.669295939912106e-07, "loss": 0.0, "num_tokens": 6873588.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.05118082510307431, "epoch": 0.8744059742023083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.567834743600202e-07, "loss": 0.0, "num_tokens": 6878277.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 144.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07870891969650984, "epoch": 0.8750848608282417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.466881929931582e-07, "loss": 0.0, "num_tokens": 6882663.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 190.5, "completions/mean_terminated_length": 190.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.05558400508016348, "epoch": 0.8757637474541752, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "learning_rate": 9.366438066432804e-07, "loss": 0.0, "num_tokens": 6887555.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.10110961738973856, "epoch": 0.8764426340801086, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "learning_rate": 9.266503717769315e-07, "loss": -0.0, "num_tokens": 6892116.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 257.375, "completions/mean_terminated_length": 257.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.025766295846551657, "epoch": 0.8771215207060421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.167079445742188e-07, "loss": 0.0, "num_tokens": 6898431.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 81.125, "completions/mean_terminated_length": 81.125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.043448422104120255, "epoch": 0.8778004073319755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.06816580928499e-07, "loss": 0.0, "num_tokens": 6902160.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 127.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.03499178518541157, "epoch": 0.878479293957909, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.969763364460682e-07, "loss": 0.0, "num_tokens": 6906350.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 72.5, "completions/mean_terminated_length": 72.5, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.03780035534873605, "epoch": 0.8791581805838425, "frac_reward_zero_std": 0.0, "grad_norm": 4.25, "learning_rate": 8.871872664458459e-07, "loss": -0.0, "num_tokens": 6910074.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 93.875, "completions/mean_terminated_length": 93.875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.03398936497978866, "epoch": 0.879837067209776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.774494259590594e-07, "loss": 0.0, "num_tokens": 6913929.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.031143828528001904, "epoch": 0.8805159538357095, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "learning_rate": 8.677628697289408e-07, "loss": -0.0, "num_tokens": 6919980.0, "reward": 2.8214285373687744, "reward_std": 0.3642157018184662, "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.15152287483215332, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 84.875, "completions/mean_terminated_length": 84.875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.034618568839505315, "epoch": 0.8811948404616429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.581276522104198e-07, "loss": 0.0, "num_tokens": 6923619.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.046522431541234255, "epoch": 0.8818737270875764, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "learning_rate": 8.485438275698154e-07, "loss": 0.0, "num_tokens": 6927901.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.0452120965346694, "epoch": 0.8825526137135098, "frac_reward_zero_std": 0.0, "grad_norm": 7.03125, "learning_rate": 8.39011449684527e-07, "loss": -0.0, "num_tokens": 6932240.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 117.75, "completions/mean_terminated_length": 117.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.0458712843246758, "epoch": 0.8832315003394433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.295305721427371e-07, "loss": 0.0, "num_tokens": 6936238.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.02186664822511375, "epoch": 0.8839103869653768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.201012482431125e-07, "loss": 0.0, "num_tokens": 6940290.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 247.375, "completions/mean_terminated_length": 247.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.07473768247291446, "epoch": 0.8845892735913102, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "learning_rate": 8.10723530994496e-07, "loss": 0.0, "num_tokens": 6945725.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 130.375, "completions/mean_terminated_length": 130.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.07881530933082104, "epoch": 0.8852681602172437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.01397473115616e-07, "loss": 0.0, "num_tokens": 6949904.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.05573853477835655, "epoch": 0.8859470468431772, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "learning_rate": 7.921231270347851e-07, "loss": 0.0, "num_tokens": 6954277.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.07590067246928811, "epoch": 0.8866259334691107, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "learning_rate": 7.82900544889612e-07, "loss": -0.0, "num_tokens": 6959426.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 202.375, "completions/mean_terminated_length": 202.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.05649718875065446, "epoch": 0.8873048200950441, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "learning_rate": 7.737297785266995e-07, "loss": -0.0, "num_tokens": 6964973.0, "reward": 2.3499999046325684, "reward_std": 0.47509393095970154, "rewards/fixed_code_pass_all_test_reward/mean": 0.7250000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.14880476891994476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.024505326058715582, "epoch": 0.8879837067209776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.646108795013563e-07, "loss": 0.0, "num_tokens": 6969768.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.025752554647624493, "epoch": 0.8886625933469111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.555438990773134e-07, "loss": 0.0, "num_tokens": 6975335.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.0454097636975348, "epoch": 0.8893414799728445, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "learning_rate": 7.46528888226431e-07, "loss": 0.0, "num_tokens": 6979681.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 130.375, "completions/mean_terminated_length": 130.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.07535605737939477, "epoch": 0.890020366598778, "frac_reward_zero_std": 0.0, "grad_norm": 4.4375, "learning_rate": 7.375658976284073e-07, "loss": -0.0, "num_tokens": 6983980.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 311.75, "completions/mean_terminated_length": 311.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.04017899348400533, "epoch": 0.8906992532247114, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "learning_rate": 7.286549776704987e-07, "loss": -0.0, "num_tokens": 6991290.0, "reward": 1.7857142686843872, "reward_std": 0.3581618368625641, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.3581618070602417, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.07340986281633377, "epoch": 0.891378139850645, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "learning_rate": 7.197961784472396e-07, "loss": -0.0, "num_tokens": 6996083.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06486464524641633, "epoch": 0.8920570264765784, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "learning_rate": 7.109895497601571e-07, "loss": 0.0, "num_tokens": 7000312.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.0960701210424304, "epoch": 0.8927359131025119, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "learning_rate": 7.022351411174866e-07, "loss": -0.0, "num_tokens": 7005964.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.05812152964062989, "epoch": 0.8934147997284454, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.935330017339003e-07, "loss": 0.0, "num_tokens": 7010124.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 260.875, "completions/mean_terminated_length": 260.875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.020657073939219117, "epoch": 0.8940936863543788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.848831805302314e-07, "loss": 0.0, "num_tokens": 7016587.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 116.125, "completions/mean_terminated_length": 116.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.0774011854082346, "epoch": 0.8947725729803123, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "learning_rate": 6.762857261331901e-07, "loss": 0.0, "num_tokens": 7021092.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 319.25, "completions/mean_terminated_length": 319.25, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.06596643687225878, "epoch": 0.8954514596062457, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.677406868751013e-07, "loss": 0.0, "num_tokens": 7028726.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 154.5, "completions/mean_terminated_length": 154.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.042495643720030785, "epoch": 0.8961303462321792, "frac_reward_zero_std": 0.0, "grad_norm": 7.96875, "learning_rate": 6.592481107936243e-07, "loss": 0.0, "num_tokens": 7033154.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 160.875, "completions/mean_terminated_length": 160.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.05387642700225115, "epoch": 0.8968092328581126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.50808045631488e-07, "loss": 0.0, "num_tokens": 7037681.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 288.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.03018986596725881, "epoch": 0.8974881194840462, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "learning_rate": 6.424205388362203e-07, "loss": -0.0, "num_tokens": 7044757.0, "reward": 2.6666665077209473, "reward_std": 0.4714045226573944, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 242.0, "completions/mean_terminated_length": 242.0, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.031667630886659026, "epoch": 0.8981670061099797, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "learning_rate": 6.340856375598781e-07, "loss": -0.0, "num_tokens": 7051141.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 280.5, "completions/mean_terminated_length": 280.5, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.032366921193897724, "epoch": 0.8988458927359131, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "learning_rate": 6.258033886587911e-07, "loss": 0.0, "num_tokens": 7058505.0, "reward": 2.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 113.875, "completions/mean_terminated_length": 113.875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.046867894008755684, "epoch": 0.8995247793618466, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.175738386932917e-07, "loss": 0.0, "num_tokens": 7062512.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06957761477679014, "epoch": 0.90020366598778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.093970339274513e-07, "loss": 0.0, "num_tokens": 7066800.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.06062587955966592, "epoch": 0.9008825526137135, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.012730203288286e-07, "loss": 0.0, "num_tokens": 7071536.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.06407987233251333, "epoch": 0.9015614392396469, "frac_reward_zero_std": 0.0, "grad_norm": 3.390625, "learning_rate": 5.932018435681985e-07, "loss": 0.0, "num_tokens": 7076170.0, "reward": 2.125, "reward_std": 0.9910312294960022, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.08480401104316115, "epoch": 0.9022403258655805, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "learning_rate": 5.851835490193136e-07, "loss": 0.0, "num_tokens": 7080770.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 163.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.042670343071222305, "epoch": 0.902919212491514, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "learning_rate": 5.772181817586309e-07, "loss": 0.0, "num_tokens": 7085543.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.03924781363457441, "epoch": 0.9035980991174474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.693057865650676e-07, "loss": 0.0, "num_tokens": 7090550.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 103.5, "completions/mean_terminated_length": 103.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.04539029020816088, "epoch": 0.9042769857433809, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.614464079197457e-07, "loss": 0.0, "num_tokens": 7094538.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.0610105381347239, "epoch": 0.9049558723693143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.536400900057493e-07, "loss": 0.0, "num_tokens": 7098786.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.04104882990941405, "epoch": 0.9056347589952478, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.458868767078673e-07, "loss": 0.0, "num_tokens": 7103029.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 147.0, "completions/mean_terminated_length": 147.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.05672460934147239, "epoch": 0.9063136456211812, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "learning_rate": 5.381868116123512e-07, "loss": 0.0, "num_tokens": 7107349.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.0988643690943718, "epoch": 0.9069925322471147, "frac_reward_zero_std": 0.0, "grad_norm": 4.25, "learning_rate": 5.305399380066656e-07, "loss": 0.0, "num_tokens": 7112402.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.04725077305920422, "epoch": 0.9076714188730483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.229462988792566e-07, "loss": 0.0, "num_tokens": 7117836.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 107.5, "completions/mean_terminated_length": 107.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.04917873814702034, "epoch": 0.9083503054989817, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "learning_rate": 5.154059369192932e-07, "loss": 0.0, "num_tokens": 7121888.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.028185414150357246, "epoch": 0.9090291921249152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.079188945164426e-07, "loss": 0.0, "num_tokens": 7126591.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 187.625, "completions/mean_terminated_length": 187.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.0862139780074358, "epoch": 0.9097080787508486, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "learning_rate": 5.004852137606198e-07, "loss": 0.0, "num_tokens": 7131364.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 215.375, "completions/mean_terminated_length": 215.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.026261223945766687, "epoch": 0.9103869653767821, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.931049364417628e-07, "loss": 0.0, "num_tokens": 7137103.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 190.625, "completions/mean_terminated_length": 190.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.0418059267103672, "epoch": 0.9110658520027155, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "learning_rate": 4.857781040495857e-07, "loss": -0.0, "num_tokens": 7142012.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.09205880155786872, "epoch": 0.911744738628649, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "learning_rate": 4.785047577733515e-07, "loss": -0.0, "num_tokens": 7146984.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 335.0, "completions/mean_terminated_length": 335.0, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "entropy": 0.011861839797347784, "epoch": 0.9124236252545825, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "learning_rate": 4.7128493850164715e-07, "loss": -0.0, "num_tokens": 7154784.0, "reward": 2.920454502105713, "reward_std": 0.03214118629693985, "rewards/fixed_code_pass_all_test_reward/mean": 0.9204546213150024, "rewards/fixed_code_pass_all_test_reward/std": 0.03214120864868164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 226.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.04227684112265706, "epoch": 0.9131025118805159, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.6411868682213923e-07, "loss": 0.0, "num_tokens": 7160629.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 103.75, "completions/mean_terminated_length": 103.75, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.06012857146561146, "epoch": 0.9137813985064495, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5700604302135633e-07, "loss": 0.0, "num_tokens": 7164603.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.049112992361187935, "epoch": 0.9144602851323829, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4994704708445804e-07, "loss": 0.0, "num_tokens": 7168949.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 318.25, "completions/mean_terminated_length": 318.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.0391323184594512, "epoch": 0.9151391717583164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.42941738695013e-07, "loss": 0.0, "num_tokens": 7176591.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 241.5, "completions/mean_terminated_length": 241.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.0470014913007617, "epoch": 0.9158180583842498, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.359901572347758e-07, "loss": 0.0, "num_tokens": 7182963.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "entropy": 0.034591716481372714, "epoch": 0.9164969450101833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.290923417834625e-07, "loss": 0.0, "num_tokens": 7190667.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.0638595069758594, "epoch": 0.9171758316361168, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "learning_rate": 4.222483311185299e-07, "loss": 0.0, "num_tokens": 7195423.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 171.875, "completions/mean_terminated_length": 171.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.047411042265594006, "epoch": 0.9178547182620502, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "learning_rate": 4.1545816371496685e-07, "loss": 0.0, "num_tokens": 7200046.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.03261782415211201, "epoch": 0.9185336048879837, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.087218777450652e-07, "loss": 0.0, "num_tokens": 7205218.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 138.625, "completions/mean_terminated_length": 138.625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07520277053117752, "epoch": 0.9192124915139172, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "learning_rate": 4.02039511078216e-07, "loss": 0.0, "num_tokens": 7209591.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.046233424451202154, "epoch": 0.9198913781398507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.954111012806894e-07, "loss": 0.0, "num_tokens": 7214358.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 113.0, "completions/mean_terminated_length": 113.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.05395610770210624, "epoch": 0.9205702647657841, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8883668561542907e-07, "loss": 0.0, "num_tokens": 7218390.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 228.875, "completions/mean_terminated_length": 228.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.034961943863891065, "epoch": 0.9212491513917176, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "learning_rate": 3.8231630104183514e-07, "loss": 0.0, "num_tokens": 7223485.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 114.5, "completions/mean_terminated_length": 114.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.045125515665858984, "epoch": 0.921928038017651, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.7584998421556387e-07, "loss": 0.0, "num_tokens": 7227665.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 134.625, "completions/mean_terminated_length": 134.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.10880998056381941, "epoch": 0.9226069246435845, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6943777148831907e-07, "loss": 0.0, "num_tokens": 7231854.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 168.125, "completions/mean_terminated_length": 168.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.06575705483555794, "epoch": 0.923285811269518, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "learning_rate": 3.6307969890764907e-07, "loss": 0.0, "num_tokens": 7236687.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06530605629086494, "epoch": 0.9239646978954514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.567758022167378e-07, "loss": 0.0, "num_tokens": 7240930.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.04267252469435334, "epoch": 0.924643584521385, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.505261168542107e-07, "loss": 0.0, "num_tokens": 7246581.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 109.625, "completions/mean_terminated_length": 109.625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.06701449351385236, "epoch": 0.9253224711473184, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "learning_rate": 3.443306779539335e-07, "loss": 0.0, "num_tokens": 7250562.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.053723922930657864, "epoch": 0.9260013577732519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.381895203448182e-07, "loss": 0.0, "num_tokens": 7254815.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.0972116100601852, "epoch": 0.9266802443991853, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "learning_rate": 3.321026785506165e-07, "loss": 0.0, "num_tokens": 7259479.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.04474888020195067, "epoch": 0.9273591310251188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2607018678973646e-07, "loss": 0.0, "num_tokens": 7265047.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07023880514316261, "epoch": 0.9280380176510523, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2009207897504945e-07, "loss": 0.0, "num_tokens": 7269637.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 322.125, "completions/mean_terminated_length": 322.125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.05936228530481458, "epoch": 0.9287169042769857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1416838871368925e-07, "loss": 0.0, "num_tokens": 7277262.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.08179679419845343, "epoch": 0.9293957909029192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0829914930687767e-07, "loss": 0.0, "num_tokens": 7281554.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 165.0, "completions/mean_terminated_length": 165.0, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.07411248236894608, "epoch": 0.9300746775288526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.024843937497246e-07, "loss": 0.0, "num_tokens": 7285914.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 299.25, "completions/mean_terminated_length": 299.25, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.032543769804760814, "epoch": 0.9307535641547862, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9672415473105286e-07, "loss": 0.0, "num_tokens": 7292972.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.02955157309770584, "epoch": 0.9314324507807196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9101846463320483e-07, "loss": 0.0, "num_tokens": 7298065.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.08593884389847517, "epoch": 0.9321113374066531, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "learning_rate": 2.8536735553186814e-07, "loss": -0.0, "num_tokens": 7302577.0, "reward": 2.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.03686584485694766, "epoch": 0.9327902240325866, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7977085919589253e-07, "loss": 0.0, "num_tokens": 7308554.0, "reward": 1.75, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 392.125, "completions/mean_terminated_length": 392.125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "entropy": 0.04892592295072973, "epoch": 0.93346911065852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7422900708710896e-07, "loss": 0.0, "num_tokens": 7317035.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.01983888167887926, "epoch": 0.9341479972844535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.687418303601563e-07, "loss": 0.0, "num_tokens": 7323089.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.05106785940006375, "epoch": 0.9348268839103869, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.633093598623049e-07, "loss": 0.0, "num_tokens": 7327449.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.07594262389466166, "epoch": 0.9355057705363204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5793162613328095e-07, "loss": 0.0, "num_tokens": 7332977.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 94.75, "completions/mean_terminated_length": 94.75, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.024748916970565915, "epoch": 0.9361846571622539, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5260865940510027e-07, "loss": 0.0, "num_tokens": 7336751.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.07646798528730869, "epoch": 0.9368635437881874, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "learning_rate": 2.4734048960189385e-07, "loss": 0.0, "num_tokens": 7342579.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 109.25, "completions/mean_terminated_length": 109.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.03955674055032432, "epoch": 0.9375424304141209, "frac_reward_zero_std": 0.0, "grad_norm": 4.34375, "learning_rate": 2.421271463397368e-07, "loss": 0.0, "num_tokens": 7346805.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.04381275549530983, "epoch": 0.9382213170400543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.369686589264919e-07, "loss": 0.0, "num_tokens": 7351377.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.11116392724215984, "epoch": 0.9389002036659878, "frac_reward_zero_std": 0.0, "grad_norm": 4.28125, "learning_rate": 2.3186505636163316e-07, "loss": -0.0, "num_tokens": 7356087.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.04105406999588013, "epoch": 0.9395790902919212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2681636733609457e-07, "loss": 0.0, "num_tokens": 7361984.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 332.25, "completions/mean_terminated_length": 332.25, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.03562983754090965, "epoch": 0.9402579769178547, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "learning_rate": 2.2182262023209612e-07, "loss": -0.0, "num_tokens": 7369890.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.04823680454865098, "epoch": 0.9409368635437881, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.168838431229958e-07, "loss": 0.0, "num_tokens": 7374659.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.03606553701683879, "epoch": 0.9416157501697217, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1200006377312232e-07, "loss": 0.0, "num_tokens": 7381224.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.027940819272771478, "epoch": 0.9422946367956552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.071713096376271e-07, "loss": 0.0, "num_tokens": 7387115.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 149.25, "completions/mean_terminated_length": 149.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.09008469432592392, "epoch": 0.9429735234215886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0239760786232355e-07, "loss": 0.0, "num_tokens": 7391661.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 139.5, "completions/mean_terminated_length": 139.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.03246856899932027, "epoch": 0.9436524100475221, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "learning_rate": 1.9767898528353923e-07, "loss": 0.0, "num_tokens": 7396529.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 242.25, "completions/mean_terminated_length": 242.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.03269170830026269, "epoch": 0.9443312966734555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.930154684279628e-07, "loss": 0.0, "num_tokens": 7402747.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.10386022925376892, "epoch": 0.945010183299389, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "learning_rate": 1.8840708351249182e-07, "loss": -0.0, "num_tokens": 7407836.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.030043622478842735, "epoch": 0.9456890699253224, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "learning_rate": 1.8385385644409282e-07, "loss": 0.0, "num_tokens": 7414166.0, "reward": 2.7708334922790527, "reward_std": 0.2946277856826782, "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 147.125, "completions/mean_terminated_length": 147.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.06427620584145188, "epoch": 0.9463679565512559, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.793558128196493e-07, "loss": 0.0, "num_tokens": 7418631.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 337.0, "completions/mean_terminated_length": 337.0, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "entropy": 0.03572363778948784, "epoch": 0.9470468431771895, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7491297792581963e-07, "loss": 0.0, "num_tokens": 7426527.0, "reward": 1.5714285373687744, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 163.625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06719027180224657, "epoch": 0.9477257298031229, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.705253767388948e-07, "loss": 0.0, "num_tokens": 7431444.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.04964815126731992, "epoch": 0.9484046164290564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.661930339246609e-07, "loss": 0.0, "num_tokens": 7439361.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 278.375, "completions/mean_terminated_length": 278.375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.05206569563597441, "epoch": 0.9490835030549898, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6191597383825473e-07, "loss": 0.0, "num_tokens": 7446452.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.03718523355200887, "epoch": 0.9497623896809233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5769422052403172e-07, "loss": 0.0, "num_tokens": 7451196.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 119.5, "completions/mean_terminated_length": 119.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06675209756940603, "epoch": 0.9504412763068567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5352779771543037e-07, "loss": 0.0, "num_tokens": 7455328.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 210.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.04172407649457455, "epoch": 0.9511201629327902, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "learning_rate": 1.494167288348347e-07, "loss": -0.0, "num_tokens": 7460832.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.03628377616405487, "epoch": 0.9517990495587237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4536103699344884e-07, "loss": 0.0, "num_tokens": 7466263.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 233.25, "completions/mean_terminated_length": 233.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.033826621947810054, "epoch": 0.9524779361846571, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "learning_rate": 1.4136074499115914e-07, "loss": -0.0, "num_tokens": 7472449.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 108.5, "completions/mean_terminated_length": 108.5, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.046879804227501154, "epoch": 0.9531568228105907, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3741587531641566e-07, "loss": 0.0, "num_tokens": 7476541.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.050881276838481426, "epoch": 0.9538357094365241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3352645014609756e-07, "loss": 0.0, "num_tokens": 7480777.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 106.75, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.04698456637561321, "epoch": 0.9545145960624576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.296924913453923e-07, "loss": 0.0, "num_tokens": 7484919.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 143.25, "completions/mean_terminated_length": 143.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.035350932739675045, "epoch": 0.955193482688391, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "learning_rate": 1.259140204676712e-07, "loss": 0.0, "num_tokens": 7489881.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 67.125, "completions/mean_terminated_length": 67.125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.04363395366817713, "epoch": 0.9558723693143245, "frac_reward_zero_std": 0.0, "grad_norm": 9.3125, "learning_rate": 1.2219105875437176e-07, "loss": -0.0, "num_tokens": 7493466.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.032257709885016084, "epoch": 0.956551255940258, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "learning_rate": 1.185236271348722e-07, "loss": -0.0, "num_tokens": 7500434.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 154.375, "completions/mean_terminated_length": 154.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.06615744018927217, "epoch": 0.9572301425661914, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "learning_rate": 1.1491174622637934e-07, "loss": 0.0, "num_tokens": 7504933.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.06036748504266143, "epoch": 0.957909029192125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1135543633380764e-07, "loss": 0.0, "num_tokens": 7509322.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 231.375, "completions/mean_terminated_length": 231.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.027164625702425838, "epoch": 0.9585879158180584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0785471744967247e-07, "loss": 0.0, "num_tokens": 7515365.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 183.75, "completions/mean_terminated_length": 183.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.0438750758767128, "epoch": 0.9592668024439919, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "learning_rate": 1.0440960925396925e-07, "loss": 0.0, "num_tokens": 7520043.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.09158069361001253, "epoch": 0.9599456890699253, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0102013111406905e-07, "loss": 0.0, "num_tokens": 7526316.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 123.875, "completions/mean_terminated_length": 123.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05266101472079754, "epoch": 0.9606245756958588, "frac_reward_zero_std": 0.0, "grad_norm": 4.84375, "learning_rate": 9.768630208460528e-08, "loss": 0.0, "num_tokens": 7530595.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.05456627393141389, "epoch": 0.9613034623217923, "frac_reward_zero_std": 0.0, "grad_norm": 3.859375, "learning_rate": 9.440814090737049e-08, "loss": 0.0, "num_tokens": 7536369.0, "reward": 1.821428656578064, "reward_std": 0.10101527720689774, "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.10101524740457535, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 465.25, "completions/mean_terminated_length": 465.25, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.020693805068731308, "epoch": 0.9619823489477257, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "learning_rate": 9.11856660112076e-08, "loss": 0.0, "num_tokens": 7545723.0, "reward": 0.9285714626312256, "reward_std": 0.2020305097103119, "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.2020305097103119, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 111.875, "completions/mean_terminated_length": 111.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07267671870067716, "epoch": 0.9626612355736592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.80188955119099e-08, "loss": 0.0, "num_tokens": 7549778.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.05876992456614971, "epoch": 0.9633401221995926, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "learning_rate": 8.490784721211454e-08, "loss": -0.0, "num_tokens": 7554368.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 140.875, "completions/mean_terminated_length": 140.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.055333992931991816, "epoch": 0.9640190088255262, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.185253860120701e-08, "loss": 0.0, "num_tokens": 7558879.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.056854897644370794, "epoch": 0.9646978954514596, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "learning_rate": 7.885298685522235e-08, "loss": -0.0, "num_tokens": 7566177.0, "reward": 1.5178570747375488, "reward_std": 0.07393556088209152, "rewards/fixed_code_pass_all_test_reward/mean": 0.5178571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.0739356055855751, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 75.375, "completions/mean_terminated_length": 75.375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.06834522588178515, "epoch": 0.9653767820773931, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.590920883674192e-08, "loss": 0.0, "num_tokens": 7570020.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 129.125, "completions/mean_terminated_length": 129.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.05737362802028656, "epoch": 0.9660556687033266, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.302122109481002e-08, "loss": 0.0, "num_tokens": 7574165.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.039678098633885384, "epoch": 0.96673455532926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.018903986483083e-08, "loss": 0.0, "num_tokens": 7578412.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.045340674463659525, "epoch": 0.9674134419551935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.741268106848164e-08, "loss": 0.0, "num_tokens": 7582609.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 107.875, "completions/mean_terminated_length": 107.875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06596353463828564, "epoch": 0.9680923285811269, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.469216031362302e-08, "loss": 0.0, "num_tokens": 7586576.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 90.75, "completions/mean_terminated_length": 90.75, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.04027901426889002, "epoch": 0.9687712152070604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.202749289420994e-08, "loss": 0.0, "num_tokens": 7590366.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.051211504731327295, "epoch": 0.9694501018329938, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.9418693790209705e-08, "loss": 0.0, "num_tokens": 7595167.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 197.875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.021212840685620904, "epoch": 0.9701289884589274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.686577766751078e-08, "loss": 0.0, "num_tokens": 7601230.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 399.75, "completions/mean_terminated_length": 399.75, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.0578672387637198, "epoch": 0.9708078750848609, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "learning_rate": 5.4368758877845204e-08, "loss": 0.0, "num_tokens": 7609748.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 267.5, "completions/mean_terminated_length": 267.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.04076833697035909, "epoch": 0.9714867617107943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.192765145870748e-08, "loss": 0.0, "num_tokens": 7616600.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.05700752744451165, "epoch": 0.9721656483367278, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.954246913327576e-08, "loss": 0.0, "num_tokens": 7620770.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.03856371785514057, "epoch": 0.9728445349626612, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.721322531033301e-08, "loss": 0.0, "num_tokens": 7626480.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 123.125, "completions/mean_terminated_length": 123.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.051664297003299, "epoch": 0.9735234215885947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4939933084192646e-08, "loss": 0.0, "num_tokens": 7630817.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 80.0, "completions/mean_terminated_length": 80.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.018483178922906518, "epoch": 0.9742023082145281, "frac_reward_zero_std": 0.0, "grad_norm": 12.9375, "learning_rate": 4.2722605234625236e-08, "loss": -0.0, "num_tokens": 7634553.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 210.625, "completions/mean_terminated_length": 210.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.03427370940335095, "epoch": 0.9748811948404617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.0561254226786365e-08, "loss": 0.0, "num_tokens": 7640390.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 244.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.029702861327677965, "epoch": 0.9755600814663951, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "learning_rate": 3.845589221114554e-08, "loss": 0.0, "num_tokens": 7646963.0, "reward": 1.9305555820465088, "reward_std": 0.05750548839569092, "rewards/fixed_code_pass_all_test_reward/mean": 0.930555522441864, "rewards/fixed_code_pass_all_test_reward/std": 0.05750546231865883, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.021351289004087448, "epoch": 0.9762389680923286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.6406531023420735e-08, "loss": 0.0, "num_tokens": 7653374.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 163.5, "completions/mean_terminated_length": 163.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.060278112068772316, "epoch": 0.9769178547182621, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4413182184507285e-08, "loss": 0.0, "num_tokens": 7658154.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.10810685250908136, "epoch": 0.9775967413441955, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "learning_rate": 3.24758569004191e-08, "loss": 0.0, "num_tokens": 7663355.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 140.875, "completions/mean_terminated_length": 140.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.03728607762604952, "epoch": 0.978275627970129, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "learning_rate": 3.0594566062219776e-08, "loss": -0.0, "num_tokens": 7668338.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.04085223888978362, "epoch": 0.9789545145960624, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "learning_rate": 2.8769320245966014e-08, "loss": -0.0, "num_tokens": 7672973.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.07143508875742555, "epoch": 0.9796334012219959, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7000129712643208e-08, "loss": 0.0, "num_tokens": 7677123.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.03794521884992719, "epoch": 0.9803122878479293, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.528700440811438e-08, "loss": 0.0, "num_tokens": 7681637.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 162.125, "completions/mean_terminated_length": 162.125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.04380248347297311, "epoch": 0.9809911744738629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3629953963058007e-08, "loss": 0.0, "num_tokens": 7686278.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.07558493688702583, "epoch": 0.9816700610997964, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "learning_rate": 2.2028987692915836e-08, "loss": -0.0, "num_tokens": 7690592.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 160.375, "completions/mean_terminated_length": 160.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.05028433492407203, "epoch": 0.9823489477257298, "frac_reward_zero_std": 0.0, "grad_norm": 3.453125, "learning_rate": 2.048411459784516e-08, "loss": -0.0, "num_tokens": 7695587.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 386.25, "completions/mean_terminated_length": 386.25, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "entropy": 0.12325855065137148, "epoch": 0.9830278343516633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8995343362658846e-08, "loss": 0.0, "num_tokens": 7704501.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 200.375, "completions/mean_terminated_length": 200.375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.05211532860994339, "epoch": 0.9837067209775967, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7562682356786488e-08, "loss": 0.0, "num_tokens": 7709928.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.15142503194510937, "epoch": 0.9843856076035302, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.618613963421889e-08, "loss": 0.0, "num_tokens": 7714228.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.10641594044864178, "epoch": 0.9850644942294636, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "learning_rate": 1.4865722933469218e-08, "loss": 0.0, "num_tokens": 7719873.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 186.625, "completions/mean_terminated_length": 186.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.0543354912661016, "epoch": 0.9857433808553971, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "learning_rate": 1.3601439677526363e-08, "loss": -0.0, "num_tokens": 7724758.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 86.375, "completions/mean_terminated_length": 86.375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.048309564124792814, "epoch": 0.9864222674813307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2393296973812751e-08, "loss": 0.0, "num_tokens": 7728489.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 198.125, "completions/mean_terminated_length": 198.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.09527547471225262, "epoch": 0.9871011541072641, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "learning_rate": 1.1241301614147715e-08, "loss": 0.0, "num_tokens": 7733474.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.11967422068119049, "epoch": 0.9877800407331976, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "learning_rate": 1.0145460074703073e-08, "loss": -0.0, "num_tokens": 7738806.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 397.125, "completions/mean_terminated_length": 397.125, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "entropy": 0.02805103559512645, "epoch": 0.988458927359131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.105778515974273e-09, "loss": 0.0, "num_tokens": 7747687.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.031156501034274697, "epoch": 0.9891378139850645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.12226278274042e-09, "loss": 0.0, "num_tokens": 7752132.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 197.0, "completions/mean_terminated_length": 197.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.06611996795982122, "epoch": 0.9898167006109979, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.194918404033191e-09, "loss": 0.0, "num_tokens": 7757668.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.051881998777389526, "epoch": 0.9904955872369314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.323750593106859e-09, "loss": 0.0, "num_tokens": 7762074.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.08997983951121569, "epoch": 0.991174473862865, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "learning_rate": 5.508764247406096e-09, "loss": -0.0, "num_tokens": 7766897.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 236.75, "completions/mean_terminated_length": 236.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.0508587802760303, "epoch": 0.9918533604887984, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "learning_rate": 4.749963948540437e-09, "loss": 0.0, "num_tokens": 7773279.0, "reward": 2.8500001430511475, "reward_std": 0.1414213627576828, "rewards/fixed_code_pass_all_test_reward/mean": 0.8499999642372131, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.10342225804924965, "epoch": 0.9925322471147319, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "learning_rate": 4.047353962259859e-09, "loss": 0.0, "num_tokens": 7777941.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.03919437597505748, "epoch": 0.9932111337406653, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "learning_rate": 3.4009382384270206e-09, "loss": -0.0, "num_tokens": 7783147.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.05942561570554972, "epoch": 0.9938900203665988, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "learning_rate": 2.810720410998391e-09, "loss": 0.0, "num_tokens": 7787527.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.05872875917702913, "epoch": 0.9945689069925322, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2767037980031548e-09, "loss": 0.0, "num_tokens": 7792116.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 157.75, "completions/mean_terminated_length": 157.75, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06247218558564782, "epoch": 0.9952477936184657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7988914015221182e-09, "loss": 0.0, "num_tokens": 7796626.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 206.0, "completions/mean_terminated_length": 206.0, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.0569450743496418, "epoch": 0.9959266802443992, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "learning_rate": 1.3772859076754962e-09, "loss": 0.0, "num_tokens": 7802242.0, "reward": 1.9375, "reward_std": 0.08625822514295578, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.08625820279121399, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 162.625, "completions/mean_terminated_length": 162.625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.04403562285006046, "epoch": 0.9966055668703326, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0118896866018192e-09, "loss": 0.0, "num_tokens": 7806639.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 112.75, "completions/mean_terminated_length": 112.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.0495642744936049, "epoch": 0.9972844534962662, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.027047924512698e-10, "loss": 0.0, "num_tokens": 7810885.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.05805972497910261, "epoch": 0.9979633401221996, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "learning_rate": 4.497329633679215e-10, "loss": -0.0, "num_tokens": 7815819.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 172.0, "completions/mean_terminated_length": 172.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.05528525682166219, "epoch": 0.9986422267481331, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.529756214841861e-10, "loss": 0.0, "num_tokens": 7820563.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 106.875, "completions/mean_terminated_length": 106.875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.053659217432141304, "epoch": 0.9993211133740665, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1243387291082208e-10, "loss": 0.0, "num_tokens": 7824490.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 127.375, "completions/mean_terminated_length": 127.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.049180077854543924, "epoch": 1.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.810850773249385e-11, "loss": 0.0, "num_tokens": 7828629.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1473 }, { "epoch": 1.0, "step": 1473, "total_flos": 0.0, "train_loss": -4.9164727360556425e-09, "train_runtime": 11457.4856, "train_samples_per_second": 0.129, "train_steps_per_second": 0.129 } ], "logging_steps": 1, "max_steps": 1473, "num_input_tokens_seen": 7828629, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }