{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3689356207341819, "eval_steps": 5000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 532.125, "completions/mean_terminated_length": 532.125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.00018446781036709093, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 8641.0, "reward": 1.9572510719299316, "reward_std": 0.6216927766799927, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20725108683109283, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23192013800144196, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 506.5, "completions/mean_terminated_length": 286.2857360839844, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.00036893562073418186, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.0, "learning_rate": 1.2292562999385371e-08, "loss": -0.0, "num_tokens": 17997.0, "reward": 0.949999988079071, "reward_std": 0.6740071773529053, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14880475401878357, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 476.0, "completions/mean_terminated_length": 476.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.0005534034311012728, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.00018203282979811775, "learning_rate": 2.4585125998770742e-08, "loss": 0.0, "num_tokens": 28037.0, "reward": 1.3327381610870361, "reward_std": 0.8638866543769836, "rewards/fixed_code_pass_all_test_reward/mean": 0.5166666507720947, "rewards/fixed_code_pass_all_test_reward/std": 0.38089287281036377, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06607142835855484, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10496078431606293, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 505.25, "completions/mean_terminated_length": 505.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.0007378712414683637, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.0002038300171989249, "learning_rate": 3.687768899815612e-08, "loss": 0.0, "num_tokens": 38903.0, "reward": 1.274999976158142, "reward_std": 0.9130013585090637, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 427.875, "completions/mean_terminated_length": 427.875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.0009223390518354548, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.00019845250426442362, "learning_rate": 4.9170251997541484e-08, "loss": 0.0, "num_tokens": 46574.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 525.125, "completions/mean_terminated_length": 525.125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.0011068068622025456, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.00018116733190254308, "learning_rate": 6.146281499692686e-08, "loss": 0.0, "num_tokens": 57327.0, "reward": 1.056249976158142, "reward_std": 0.9983692169189453, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05624999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10500850528478622, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 489.125, "completions/mean_terminated_length": 489.125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.0012912746725696365, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.00021278067288221791, "learning_rate": 7.375537799631224e-08, "loss": 0.0, "num_tokens": 66664.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 313.0, "completions/mean_terminated_length": 313.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.0014757424829367274, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.00025771011951292166, "learning_rate": 8.604794099569762e-08, "loss": 0.0, "num_tokens": 72288.0, "reward": 1.1986110210418701, "reward_std": 0.4143134653568268, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07361111044883728, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1026865765452385, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 509.75, "completions/mean_terminated_length": 509.75, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.0016602102933038186, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.0001899802564366837, "learning_rate": 9.834050399508297e-08, "loss": 0.0, "num_tokens": 83478.0, "reward": 0.875, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 642.625, "completions/mean_terminated_length": 642.625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.0018446781036709095, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.00023924431479827035, "learning_rate": 1.1063306699446835e-07, "loss": 0.0, "num_tokens": 93267.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 196.875, "completions/mean_terminated_length": 196.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.0020291459140380002, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.0001977622105187038, "learning_rate": 1.2292562999385372e-07, "loss": 0.0, "num_tokens": 97610.0, "reward": 1.037500023841858, "reward_std": 1.223504900932312, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.16249999403953552, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3113909065723419, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 726.375, "completions/mean_terminated_length": 726.375, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.002213613724405091, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.00024060228952293983, "learning_rate": 1.352181929932391e-07, "loss": 0.0, "num_tokens": 113157.0, "reward": 1.399999976158142, "reward_std": 0.386836975812912, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.34069257974624634, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 373.125, "completions/mean_terminated_length": 373.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.002398081534772182, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.00015376139162981417, "learning_rate": 1.4751075599262448e-07, "loss": 0.0, "num_tokens": 120118.0, "reward": 0.625, "reward_std": 0.5101463794708252, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.13598208129405975, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 327.125, "completions/mean_terminated_length": 327.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.002582549345139273, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.00018653134065971244, "learning_rate": 1.5980331899200987e-07, "loss": 0.0, "num_tokens": 127775.0, "reward": 0.949999988079071, "reward_std": 0.690755307674408, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.38544967770576477, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 481.0, "completions/mean_terminated_length": 481.0, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.002767017155506364, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.00018443438875692664, "learning_rate": 1.7209588199139523e-07, "loss": 0.0, "num_tokens": 137463.0, "reward": 1.7588067054748535, "reward_std": 0.3596174716949463, "rewards/fixed_code_pass_all_test_reward/mean": 0.7431818246841431, "rewards/fixed_code_pass_all_test_reward/std": 0.3438015878200531, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.015625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.04419417306780815, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.002951484965873455, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.00016786194009910105, "learning_rate": 1.843884449907806e-07, "loss": 0.0, "num_tokens": 142008.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 520.75, "completions/mean_terminated_length": 520.75, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.003135952776240546, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.00016485463766002795, "learning_rate": 1.9668100799016594e-07, "loss": 0.0, "num_tokens": 151278.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 447.5, "completions/mean_terminated_length": 447.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.003320420586607637, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.00017965910001294105, "learning_rate": 2.0897357098955133e-07, "loss": 0.0, "num_tokens": 159050.0, "reward": 0.637499988079071, "reward_std": 0.7360464334487915, "rewards/fixed_code_pass_all_test_reward/mean": 0.38749998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.507423460483551, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.003504888396974728, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0001961755824595457, "learning_rate": 2.212661339889367e-07, "loss": 0.0, "num_tokens": 163412.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 839.0, "completions/mean_terminated_length": 839.0, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.003689356207341819, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.00011436882414272986, "learning_rate": 2.3355869698832208e-07, "loss": 0.0, "num_tokens": 180724.0, "reward": 0.6488636136054993, "reward_std": 0.5387671589851379, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02386363595724106, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.04425361752510071, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 240.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.00387382401770891, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.0002771796989691211, "learning_rate": 2.4585125998770745e-07, "loss": 0.0, "num_tokens": 185597.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 552.875, "completions/mean_terminated_length": 552.875, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.0040582918280760005, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.00019327198697283166, "learning_rate": 2.581438229870928e-07, "loss": 0.0, "num_tokens": 199396.0, "reward": 0.8185184597969055, "reward_std": 0.5110114216804504, "rewards/fixed_code_pass_all_test_reward/mean": 0.018518518656492233, "rewards/fixed_code_pass_all_test_reward/std": 0.019797129556536674, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213627576828, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 515.875, "completions/mean_terminated_length": 515.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.004242759638443091, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125, "kl": 0.0002650674423421151, "learning_rate": 2.704363859864782e-07, "loss": 0.0, "num_tokens": 208867.0, "reward": 0.5160714387893677, "reward_std": 0.5404046773910522, "rewards/fixed_code_pass_all_test_reward/mean": 0.01607142947614193, "rewards/fixed_code_pass_all_test_reward/std": 0.01415758952498436, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 663.0, "completions/mean_terminated_length": 663.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.004427227448810182, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.00028305591695243493, "learning_rate": 2.827289489858636e-07, "loss": 0.0, "num_tokens": 225259.0, "reward": 0.7124999761581421, "reward_std": 1.0439451932907104, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08749999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1658.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 870.0, "completions/mean_terminated_length": 870.0, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.004611695259177273, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.00019939170488214586, "learning_rate": 2.9502151198524896e-07, "loss": 0.0, "num_tokens": 242155.0, "reward": 1.429464340209961, "reward_std": 0.7685761451721191, "rewards/fixed_code_pass_all_test_reward/mean": 0.4107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.19705888628959656, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.26875001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22894470393657684, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 585.75, "completions/mean_terminated_length": 585.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.004796163069544364, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.0002929961610789178, "learning_rate": 3.073140749846343e-07, "loss": 0.0, "num_tokens": 252401.0, "reward": 0.875, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 212.125, "completions/mean_terminated_length": 212.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.004980630879911455, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.00018967878168041352, "learning_rate": 3.1960663798401974e-07, "loss": 0.0, "num_tokens": 256954.0, "reward": 1.9666666984558105, "reward_std": 0.8125991821289062, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.34166663885116577, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3155997097492218, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 403.75, "completions/mean_terminated_length": 403.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.005165098690278546, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.00019093263108516112, "learning_rate": 3.318992009834051e-07, "loss": 0.0, "num_tokens": 266360.0, "reward": 1.5811011791229248, "reward_std": 0.6634544730186462, "rewards/fixed_code_pass_all_test_reward/mean": 0.6904761791229248, "rewards/fixed_code_pass_all_test_reward/std": 0.3376781940460205, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.015625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.04419417306780815, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 358.625, "completions/mean_terminated_length": 358.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.005349566500645637, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.00014095166670813342, "learning_rate": 3.4419176398279047e-07, "loss": 0.0, "num_tokens": 276237.0, "reward": 1.7429605722427368, "reward_std": 0.3520623743534088, "rewards/fixed_code_pass_all_test_reward/mean": 0.4878048896789551, "rewards/fixed_code_pass_all_test_reward/std": 0.3217294216156006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2551557421684265, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07695016264915466, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 552.125, "completions/mean_terminated_length": 552.125, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.005534034311012728, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.00022377472214429872, "learning_rate": 3.5648432698217583e-07, "loss": 0.0, "num_tokens": 285614.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 447.5, "completions/mean_terminated_length": 447.5, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.005718502121379819, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.000204992445105745, "learning_rate": 3.687768899815612e-07, "loss": 0.0, "num_tokens": 295674.0, "reward": 1.3546874523162842, "reward_std": 0.6637769937515259, "rewards/fixed_code_pass_all_test_reward/mean": 0.5546875, "rewards/fixed_code_pass_all_test_reward/std": 0.30424362421035767, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10690449923276901, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 324.875, "completions/mean_terminated_length": 324.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.00590296993174691, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.00015720547207820346, "learning_rate": 3.810694529809465e-07, "loss": 0.0, "num_tokens": 302225.0, "reward": 1.2000000476837158, "reward_std": 0.7171371579170227, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10350984334945679, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 329.375, "completions/mean_terminated_length": 329.375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.006087437742114001, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.0002552735295466846, "learning_rate": 3.9336201598033187e-07, "loss": 0.0, "num_tokens": 311692.0, "reward": 1.1553571224212646, "reward_std": 0.17477713525295258, "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13750000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16372402012348175, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 308.75, "completions/mean_terminated_length": 308.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.006271905552481092, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.00011653448427750845, "learning_rate": 4.056545789797173e-07, "loss": 0.0, "num_tokens": 317330.0, "reward": 1.258333444595337, "reward_std": 1.0144590139389038, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13333334028720856, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2569664418697357, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 421.375, "completions/mean_terminated_length": 421.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.006456373362848183, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.00021889815343456576, "learning_rate": 4.1794714197910265e-07, "loss": 0.0, "num_tokens": 324917.0, "reward": 1.4589285850524902, "reward_std": 0.6702524423599243, "rewards/fixed_code_pass_all_test_reward/mean": 0.6964285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.3095892369747162, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13750000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20658794045448303, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 546.625, "completions/mean_terminated_length": 546.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.006640841173215274, "frac_reward_zero_std": 0.0, "grad_norm": 0.97265625, "kl": 0.00019977040756202769, "learning_rate": 4.30239704978488e-07, "loss": 0.0, "num_tokens": 337530.0, "reward": 1.285416603088379, "reward_std": 0.7461774349212646, "rewards/fixed_code_pass_all_test_reward/mean": 0.6604166626930237, "rewards/fixed_code_pass_all_test_reward/std": 0.32949426770210266, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 541.625, "completions/mean_terminated_length": 541.625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.006825308983582365, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.00025595336592232343, "learning_rate": 4.425322679778734e-07, "loss": 0.0, "num_tokens": 349223.0, "reward": 1.274999976158142, "reward_std": 0.9130013585090637, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 488.875, "completions/mean_terminated_length": 488.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.007009776793949456, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.00021523882060137112, "learning_rate": 4.5482483097725875e-07, "loss": 0.0, "num_tokens": 357574.0, "reward": 0.8759920597076416, "reward_std": 0.6182137727737427, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.2441680133342743, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0902777761220932, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18290594220161438, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1484.0, "completions/max_terminated_length": 1484.0, "completions/mean_length": 703.875, "completions/mean_terminated_length": 703.875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.007194244604316547, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.000257832443821826, "learning_rate": 4.6711739397664416e-07, "loss": 0.0, "num_tokens": 368517.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 477.5, "completions/mean_terminated_length": 477.5, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.007378712414683638, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.0002517298598831985, "learning_rate": 4.794099569760295e-07, "loss": 0.0, "num_tokens": 376833.0, "reward": 0.625, "reward_std": 0.4464142918586731, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.2976095378398895, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 370.125, "completions/mean_terminated_length": 370.125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.007563180225050729, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.0002058015852526296, "learning_rate": 4.917025199754149e-07, "loss": 0.0, "num_tokens": 384482.0, "reward": 0.7265625, "reward_std": 0.5717859268188477, "rewards/fixed_code_pass_all_test_reward/mean": 0.0703125, "rewards/fixed_code_pass_all_test_reward/std": 0.09704047441482544, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 353.875, "completions/mean_terminated_length": 353.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.00774764803541782, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.000211220612982288, "learning_rate": 5.039950829748003e-07, "loss": 0.0, "num_tokens": 393177.0, "reward": 1.6734604835510254, "reward_std": 0.8881027698516846, "rewards/fixed_code_pass_all_test_reward/mean": 0.5431034564971924, "rewards/fixed_code_pass_all_test_reward/std": 0.4941478669643402, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.38035714626312256, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.334501713514328, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 450.75, "completions/mean_terminated_length": 450.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.00793211584578491, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.000190985512745101, "learning_rate": 5.162876459741856e-07, "loss": 0.0, "num_tokens": 403447.0, "reward": 1.5208333730697632, "reward_std": 0.7495898008346558, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3958333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3484375774860382, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.008116583656152001, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.000210995611269027, "learning_rate": 5.28580208973571e-07, "loss": 0.0, "num_tokens": 408395.0, "reward": 0.8607639074325562, "reward_std": 0.990953803062439, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.23576389253139496, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2600088119506836, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 375.25, "completions/mean_terminated_length": 375.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.008301051466519093, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.0002680025118024787, "learning_rate": 5.408727719729564e-07, "loss": 0.0, "num_tokens": 416285.0, "reward": 0.5669642686843872, "reward_std": 0.6098192930221558, "rewards/fixed_code_pass_all_test_reward/mean": 0.0357142873108387, "rewards/fixed_code_pass_all_test_reward/std": 0.06613001227378845, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 358.875, "completions/mean_terminated_length": 358.875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.008485519276886183, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.00016605524069746025, "learning_rate": 5.531653349723417e-07, "loss": 0.0, "num_tokens": 423092.0, "reward": 1.7406249046325684, "reward_std": 0.6138195991516113, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11562499403953552, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09904679656028748, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 460.0, "completions/mean_terminated_length": 460.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.008669987087253275, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0001658068740653107, "learning_rate": 5.654578979717272e-07, "loss": 0.0, "num_tokens": 432660.0, "reward": 1.3404452800750732, "reward_std": 0.5199407935142517, "rewards/fixed_code_pass_all_test_reward/mean": 0.45408162474632263, "rewards/fixed_code_pass_all_test_reward/std": 0.2953900992870331, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.011363636702299118, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.032141219824552536, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 418.625, "completions/mean_terminated_length": 418.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.008854454897620365, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.00020066913884875248, "learning_rate": 5.777504609711126e-07, "loss": 0.0, "num_tokens": 444569.0, "reward": 1.275529146194458, "reward_std": 0.21255390346050262, "rewards/fixed_code_pass_all_test_reward/mean": 0.06481481343507767, "rewards/fixed_code_pass_all_test_reward/std": 0.09338287264108658, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21071428060531616, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2512779235839844, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.009038922707987456, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.0001991905301110819, "learning_rate": 5.900430239704979e-07, "loss": 0.0, "num_tokens": 449507.0, "reward": 1.0272727012634277, "reward_std": 1.427276849746704, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2772727310657501, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.41651275753974915, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 552.375, "completions/mean_terminated_length": 552.375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.009223390518354546, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.00013304955882631475, "learning_rate": 6.023355869698833e-07, "loss": 0.0, "num_tokens": 458934.0, "reward": 1.25, "reward_std": 0.34503281116485596, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 283.875, "completions/mean_terminated_length": 283.875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.009407858328721638, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.00020546637460938655, "learning_rate": 6.146281499692686e-07, "loss": 0.0, "num_tokens": 466573.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1302.0, "completions/max_terminated_length": 1302.0, "completions/mean_length": 531.125, "completions/mean_terminated_length": 531.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.009592326139088728, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.00022698456541547785, "learning_rate": 6.26920712968654e-07, "loss": 0.0, "num_tokens": 477198.0, "reward": 1.0206395387649536, "reward_std": 1.0813695192337036, "rewards/fixed_code_pass_all_test_reward/mean": 0.18313953280448914, "rewards/fixed_code_pass_all_test_reward/std": 0.34233981370925903, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3375000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45650067925453186, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 258.5, "completions/mean_terminated_length": 258.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.00977679394945582, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.0001459388140574447, "learning_rate": 6.392132759680395e-07, "loss": 0.0, "num_tokens": 482130.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 557.0, "completions/mean_terminated_length": 557.0, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.00996126175982291, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.00019490843624225818, "learning_rate": 6.515058389674247e-07, "loss": 0.0, "num_tokens": 493074.0, "reward": 1.043055772781372, "reward_std": 0.6553028225898743, "rewards/fixed_code_pass_all_test_reward/mean": 0.11742424964904785, "rewards/fixed_code_pass_all_test_reward/std": 0.13301566243171692, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17563162744045258, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1431049108505249, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 692.375, "completions/mean_terminated_length": 692.375, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.010145729570190002, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.00022079312748246593, "learning_rate": 6.637984019668102e-07, "loss": 0.0, "num_tokens": 507941.0, "reward": 0.9807692766189575, "reward_std": 0.3334037661552429, "rewards/fixed_code_pass_all_test_reward/mean": 0.10576923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.14789307117462158, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 431.375, "completions/mean_terminated_length": 431.375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.010330197380557092, "frac_reward_zero_std": 0.0, "grad_norm": 6.59375, "kl": 0.0002935260308731813, "learning_rate": 6.760909649661955e-07, "loss": 0.0, "num_tokens": 518216.0, "reward": 1.409221887588501, "reward_std": 0.8798056244850159, "rewards/fixed_code_pass_all_test_reward/mean": 0.5485074520111084, "rewards/fixed_code_pass_all_test_reward/std": 0.35100704431533813, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11071428656578064, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11817366629838943, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 585.125, "completions/mean_terminated_length": 585.125, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.010514665190924184, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.0002699543947528582, "learning_rate": 6.883835279655809e-07, "loss": 0.0, "num_tokens": 530281.0, "reward": 1.1553571224212646, "reward_std": 1.0344274044036865, "rewards/fixed_code_pass_all_test_reward/mean": 0.3482142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.2986546754837036, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.30714285373687744, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.38172540068626404, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.010699133001291274, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.0001766249544061793, "learning_rate": 7.006760909649663e-07, "loss": 0.0, "num_tokens": 534684.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.010883600811658366, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.00014928713881090516, "learning_rate": 7.129686539643517e-07, "loss": 0.0, "num_tokens": 539168.0, "reward": 0.7666666507720947, "reward_std": 0.8447785377502441, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2666666507720947, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35096240043640137, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 510.125, "completions/mean_terminated_length": 290.4285888671875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.011068068622025456, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0001965821775229415, "learning_rate": 7.25261216963737e-07, "loss": 0.0, "num_tokens": 546265.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 361.0, "completions/mean_terminated_length": 361.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.011252536432392548, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.00024184837275242899, "learning_rate": 7.375537799631224e-07, "loss": 0.0, "num_tokens": 553689.0, "reward": 0.7124999761581421, "reward_std": 0.5157634019851685, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.08625819534063339, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.011437004242759638, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.00011721231749106664, "learning_rate": 7.498463429625078e-07, "loss": 0.0, "num_tokens": 557809.0, "reward": 2.164583444595337, "reward_std": 0.7512125968933105, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5395833253860474, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.277165025472641, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 236.75, "completions/mean_terminated_length": 236.75, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.01162147205312673, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.00021503170501091518, "learning_rate": 7.62138905961893e-07, "loss": 0.0, "num_tokens": 562879.0, "reward": 1.1749999523162842, "reward_std": 0.6798109412193298, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 247.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.01180593986349382, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.00019019987848878372, "learning_rate": 7.744314689612785e-07, "loss": 0.0, "num_tokens": 572485.0, "reward": 1.1767857074737549, "reward_std": 0.20173266530036926, "rewards/fixed_code_pass_all_test_reward/mean": 0.0892857164144516, "rewards/fixed_code_pass_all_test_reward/std": 0.0739356055855751, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08749999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 250.5, "completions/mean_terminated_length": 250.5, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.011990407673860911, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.00016406252711931302, "learning_rate": 7.867240319606637e-07, "loss": 0.0, "num_tokens": 579433.0, "reward": 1.853896141052246, "reward_std": 1.3117663860321045, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4788961112499237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.44419366121292114, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.012174875484228001, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.0001580623554673366, "learning_rate": 7.990165949600492e-07, "loss": 0.0, "num_tokens": 583780.0, "reward": 1.1840277910232544, "reward_std": 0.6370639204978943, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0590277761220932, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10955005139112473, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1285.0, "completions/max_terminated_length": 1285.0, "completions/mean_length": 611.375, "completions/mean_terminated_length": 611.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.012359343294595093, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.00023828564917494077, "learning_rate": 8.113091579594346e-07, "loss": 0.0, "num_tokens": 597943.0, "reward": 0.7749999761581421, "reward_std": 0.5496752262115479, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.14880476891994476, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 470.875, "completions/mean_terminated_length": 470.875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.012543811104962183, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.00013847056288796011, "learning_rate": 8.236017209588199e-07, "loss": 0.0, "num_tokens": 606622.0, "reward": 1.0833333730697632, "reward_std": 0.5841830968856812, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 865.75, "completions/mean_terminated_length": 696.857177734375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.012728278915329275, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.00016649827102810377, "learning_rate": 8.358942839582053e-07, "loss": 0.0, "num_tokens": 622548.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 661.25, "completions/mean_terminated_length": 661.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.012912746725696367, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.00018361350066697923, "learning_rate": 8.481868469575908e-07, "loss": 0.0, "num_tokens": 634918.0, "reward": 1.0840277671813965, "reward_std": 0.45492786169052124, "rewards/fixed_code_pass_all_test_reward/mean": 0.0486111119389534, "rewards/fixed_code_pass_all_test_reward/std": 0.046362388879060745, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.16041666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12446313351392746, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 332.625, "completions/mean_terminated_length": 332.625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.013097214536063457, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.00012149865779065294, "learning_rate": 8.60479409956976e-07, "loss": 0.0, "num_tokens": 641251.0, "reward": 1.7311147451400757, "reward_std": 0.8308025002479553, "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.3657134771347046, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2132575809955597, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16832265257835388, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 399.25, "completions/mean_terminated_length": 399.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.013281682346430549, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.0002894560730055673, "learning_rate": 8.727719729563615e-07, "loss": 0.0, "num_tokens": 648109.0, "reward": 0.8583333492279053, "reward_std": 0.5421151518821716, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.1649915874004364, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 643.75, "completions/mean_terminated_length": 443.14288330078125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.013466150156797639, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.0002160662716050865, "learning_rate": 8.850645359557468e-07, "loss": 0.0, "num_tokens": 659091.0, "reward": 1.6887930631637573, "reward_std": 0.9418144822120667, "rewards/fixed_code_pass_all_test_reward/mean": 0.7887930870056152, "rewards/fixed_code_pass_all_test_reward/std": 0.3997780680656433, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2750000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25495100021362305, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.01365061796716473, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.0001787458418220922, "learning_rate": 8.973570989551322e-07, "loss": 0.0, "num_tokens": 663544.0, "reward": 1.274999976158142, "reward_std": 0.7401736974716187, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 328.625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.01383508577753182, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.00023192215940071037, "learning_rate": 9.096496619545175e-07, "loss": 0.0, "num_tokens": 669133.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 486.375, "completions/mean_terminated_length": 486.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.014019553587898912, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.00021238852195892832, "learning_rate": 9.21942224953903e-07, "loss": 0.0, "num_tokens": 681952.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 622.875, "completions/mean_terminated_length": 622.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.014204021398266002, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0003088483390456531, "learning_rate": 9.342347879532883e-07, "loss": 0.0, "num_tokens": 694143.0, "reward": 0.6072115302085876, "reward_std": 0.7443203926086426, "rewards/fixed_code_pass_all_test_reward/mean": 0.19471153616905212, "rewards/fixed_code_pass_all_test_reward/std": 0.31740131974220276, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03750000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07440238445997238, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 414.125, "completions/mean_terminated_length": 414.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.014388489208633094, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.00019433833040238824, "learning_rate": 9.465273509526737e-07, "loss": 0.0, "num_tokens": 705864.0, "reward": 1.7954363822937012, "reward_std": 0.48687517642974854, "rewards/fixed_code_pass_all_test_reward/mean": 0.347222238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.3682146370410919, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4482142925262451, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2343575805425644, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 669.875, "completions/mean_terminated_length": 473.0000305175781, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.014572957019000184, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.0001933006387844216, "learning_rate": 9.58819913952059e-07, "loss": 0.0, "num_tokens": 717951.0, "reward": 1.374358892440796, "reward_std": 0.6861400008201599, "rewards/fixed_code_pass_all_test_reward/mean": 0.307692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.43025872111320496, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.19166667759418488, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23213982582092285, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 310.0, "completions/mean_terminated_length": 310.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.014757424829367276, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.00019329361657582922, "learning_rate": 9.711124769514444e-07, "loss": 0.0, "num_tokens": 724599.0, "reward": 0.2724999785423279, "reward_std": 0.4863787591457367, "rewards/fixed_code_pass_all_test_reward/mean": 0.022499997168779373, "rewards/fixed_code_pass_all_test_reward/std": 0.029154758900403976, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 644.875, "completions/mean_terminated_length": 644.875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.014941892639734366, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.0002033311875493382, "learning_rate": 9.834050399508298e-07, "loss": 0.0, "num_tokens": 734910.0, "reward": 0.007352941203862429, "reward_std": 0.020797258242964745, "rewards/fixed_code_pass_all_test_reward/mean": 0.007352941203862429, "rewards/fixed_code_pass_all_test_reward/std": 0.020797260105609894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 778.75, "completions/mean_terminated_length": 778.75, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.015126360450101458, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.00021693740472983336, "learning_rate": 9.956976029502152e-07, "loss": 0.0, "num_tokens": 747204.0, "reward": 0.5714285373687744, "reward_std": 0.5506423115730286, "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, "rewards/fixed_code_pass_all_test_reward/std": 0.1322600245475769, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 393.375, "completions/mean_terminated_length": 393.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.015310828260468548, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.0002189421365983435, "learning_rate": 1.0079901659496005e-06, "loss": 0.0, "num_tokens": 758223.0, "reward": 0.6624999642372131, "reward_std": 0.5527528524398804, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03750000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07440237700939178, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 575.875, "completions/mean_terminated_length": 575.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.01549529607083564, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.00026624123711371794, "learning_rate": 1.0202827289489859e-06, "loss": 0.0, "num_tokens": 770270.0, "reward": 0.6068181991577148, "reward_std": 0.6360992193222046, "rewards/fixed_code_pass_all_test_reward/mean": 0.4318181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.3879835903644562, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213627576828, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.01567976388120273, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.00016268007766484516, "learning_rate": 1.0325752919483712e-06, "loss": 0.0, "num_tokens": 781007.0, "reward": 1.2414352893829346, "reward_std": 0.5509029030799866, "rewards/fixed_code_pass_all_test_reward/mean": 0.24768519401550293, "rewards/fixed_code_pass_all_test_reward/std": 0.2882399260997772, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11875000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1066954955458641, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 391.625, "completions/mean_terminated_length": 391.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.01586423169156982, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.0001485683078499278, "learning_rate": 1.0448678549477568e-06, "loss": 0.0, "num_tokens": 787508.0, "reward": 1.619805097579956, "reward_std": 0.853951096534729, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.35197150707244873, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1510552018880844, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12871421873569489, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 134.25, "completions/mean_terminated_length": 134.25, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.016048699501936912, "frac_reward_zero_std": 0.0, "grad_norm": 4.125, "kl": 0.00034552612669358496, "learning_rate": 1.057160417947142e-06, "loss": 0.0, "num_tokens": 791230.0, "reward": 1.0526785850524902, "reward_std": 0.44148796796798706, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17767857015132904, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13837023079395294, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 402.0, "completions/mean_terminated_length": 402.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.016233167312304002, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.00014189577404977172, "learning_rate": 1.0694529809465275e-06, "loss": 0.0, "num_tokens": 798150.0, "reward": 1.7916667461395264, "reward_std": 0.9931111931800842, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2916666567325592, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.28549596667289734, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 487.125, "completions/mean_terminated_length": 487.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.016417635122671095, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.000178424124896992, "learning_rate": 1.0817455439459127e-06, "loss": 0.0, "num_tokens": 807743.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 486.375, "completions/mean_terminated_length": 486.375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.016602102933038185, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.00018611558334669098, "learning_rate": 1.0940381069452983e-06, "loss": 0.0, "num_tokens": 820514.0, "reward": 1.1594297885894775, "reward_std": 0.34879007935523987, "rewards/fixed_code_pass_all_test_reward/mean": 0.10526315867900848, "rewards/fixed_code_pass_all_test_reward/std": 0.15344610810279846, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17916667461395264, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17084059119224548, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 383.75, "completions/mean_terminated_length": 383.75, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.016786570743405275, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.00017570182399140322, "learning_rate": 1.1063306699446834e-06, "loss": 0.0, "num_tokens": 830712.0, "reward": 1.3242807388305664, "reward_std": 0.7701340317726135, "rewards/fixed_code_pass_all_test_reward/mean": 0.3645833432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.4816833734512329, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0846974179148674, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1061050072312355, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 717.375, "completions/mean_terminated_length": 717.375, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.016971038553772366, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.00024746929557295516, "learning_rate": 1.118623232944069e-06, "loss": 0.0, "num_tokens": 845435.0, "reward": 0.5, "reward_std": 0.6928203105926514, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1832250952720642, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 308.875, "completions/mean_terminated_length": 308.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.01715550636413946, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.00026283981878805207, "learning_rate": 1.1309157959434544e-06, "loss": 0.0, "num_tokens": 850842.0, "reward": 1.7125000953674316, "reward_std": 0.674404501914978, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3374999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.27613404393196106, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 341.375, "completions/mean_terminated_length": 341.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.01733997417450655, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.000265976729679096, "learning_rate": 1.1432083589428397e-06, "loss": 0.0, "num_tokens": 857821.0, "reward": 0.7019230723381042, "reward_std": 0.6609983444213867, "rewards/fixed_code_pass_all_test_reward/mean": 0.32692310214042664, "rewards/fixed_code_pass_all_test_reward/std": 0.31246304512023926, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1058.0, "completions/max_terminated_length": 1058.0, "completions/mean_length": 506.125, "completions/mean_terminated_length": 506.125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.01752444198487364, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.0002167678767364123, "learning_rate": 1.155500921942225e-06, "loss": 0.0, "num_tokens": 866942.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 353.875, "completions/mean_terminated_length": 353.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.01770890979524073, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.00021087101868033642, "learning_rate": 1.1677934849416105e-06, "loss": 0.0, "num_tokens": 876965.0, "reward": 1.3822917938232422, "reward_std": 0.6589198708534241, "rewards/fixed_code_pass_all_test_reward/mean": 0.3499999940395355, "rewards/fixed_code_pass_all_test_reward/std": 0.2828426957130432, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15729166567325592, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18853221833705902, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 325.375, "completions/mean_terminated_length": 325.375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.017893377605607823, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0001595845578776789, "learning_rate": 1.1800860479409958e-06, "loss": 0.0, "num_tokens": 887752.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.018077845415974913, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.0002337517489650054, "learning_rate": 1.1923786109403812e-06, "loss": 0.0, "num_tokens": 892437.0, "reward": 0.6499999761581421, "reward_std": 0.5424810647964478, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 562.625, "completions/mean_terminated_length": 562.625, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.018262313226342003, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.0002320545718248468, "learning_rate": 1.2046711739397666e-06, "loss": 0.0, "num_tokens": 901938.0, "reward": 0.9838709831237793, "reward_std": 0.6657989621162415, "rewards/fixed_code_pass_all_test_reward/mean": 0.2338709533214569, "rewards/fixed_code_pass_all_test_reward/std": 0.3088073432445526, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.018446781036709093, "frac_reward_zero_std": 0.0, "grad_norm": 7.6875, "kl": 0.00024526717970729806, "learning_rate": 1.216963736939152e-06, "loss": 0.0, "num_tokens": 906642.0, "reward": 0.9249999523162842, "reward_std": 0.7005100250244141, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 368.75, "completions/mean_terminated_length": 368.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.018631248847076187, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.00031117270555114374, "learning_rate": 1.2292562999385373e-06, "loss": 0.0, "num_tokens": 915032.0, "reward": 1.1041666269302368, "reward_std": 0.8113163113594055, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2291666567325592, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4266657531261444, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 337.0, "completions/mean_terminated_length": 337.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.018815716657443277, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.00027128136753162835, "learning_rate": 1.2415488629379227e-06, "loss": 0.0, "num_tokens": 924408.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1220.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 503.75, "completions/mean_terminated_length": 503.75, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.019000184467810367, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0003145008140563732, "learning_rate": 1.253841425937308e-06, "loss": 0.0, "num_tokens": 932942.0, "reward": 0.625, "reward_std": 0.5634361505508423, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022613286972, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 280.5, "completions/mean_terminated_length": 280.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.019184652278177457, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.00018538415315560997, "learning_rate": 1.2661339889366934e-06, "loss": 0.0, "num_tokens": 938034.0, "reward": 1.623809576034546, "reward_std": 0.8211174607276917, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2488095462322235, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17856916785240173, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 497.875, "completions/mean_terminated_length": 497.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.01936912008854455, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.00011359906329744263, "learning_rate": 1.278426551936079e-06, "loss": 0.0, "num_tokens": 945697.0, "reward": 1.649999976158142, "reward_std": 0.9995236992835999, "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.43129101395606995, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4208333492279053, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36206531524658203, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 643.75, "completions/mean_terminated_length": 443.14288330078125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.01955358789891164, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.000346743485351908, "learning_rate": 1.2907191149354641e-06, "loss": 0.0, "num_tokens": 954079.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 574.25, "completions/mean_terminated_length": 574.25, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.01973805570927873, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.00027589151068241335, "learning_rate": 1.3030116779348495e-06, "loss": 0.0, "num_tokens": 966041.0, "reward": 1.3653318881988525, "reward_std": 0.41499194502830505, "rewards/fixed_code_pass_all_test_reward/mean": 0.3863636553287506, "rewards/fixed_code_pass_all_test_reward/std": 0.3063669204711914, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10396825522184372, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09342183917760849, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 849.875, "completions/mean_terminated_length": 450.5, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.01992252351964582, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.00017043683692463674, "learning_rate": 1.3153042409342348e-06, "loss": 0.0, "num_tokens": 979288.0, "reward": 0.7092633843421936, "reward_std": 0.769854724407196, "rewards/fixed_code_pass_all_test_reward/mean": 0.31640625, "rewards/fixed_code_pass_all_test_reward/std": 0.34331852197647095, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.01785714365541935, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.05050762742757797, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 477.875, "completions/mean_terminated_length": 477.875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.020106991330012914, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.0002182160587835824, "learning_rate": 1.3275968039336204e-06, "loss": 0.0, "num_tokens": 990583.0, "reward": 1.7322742938995361, "reward_std": 0.6190869212150574, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.23227424919605255, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.179159015417099, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 431.0, "completions/mean_terminated_length": 431.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.020291459140380004, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.0001517423334007617, "learning_rate": 1.3398893669330058e-06, "loss": 0.0, "num_tokens": 998319.0, "reward": 1.4685440063476562, "reward_std": 0.7358102202415466, "rewards/fixed_code_pass_all_test_reward/mean": 0.7589285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.32829955220222473, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08461538702249527, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08335728198289871, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 574.25, "completions/mean_terminated_length": 574.25, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.020475926950747094, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.00018137613142243936, "learning_rate": 1.352181929932391e-06, "loss": 0.0, "num_tokens": 1009177.0, "reward": 1.8958333730697632, "reward_std": 0.5034602284431458, "rewards/fixed_code_pass_all_test_reward/mean": 0.7875000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.29489707946777344, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.23333334922790527, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2007920891046524, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 472.625, "completions/mean_terminated_length": 472.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.020660394761114184, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.00021990421828377293, "learning_rate": 1.3644744929317763e-06, "loss": 0.0, "num_tokens": 1019526.0, "reward": 1.2972756624221802, "reward_std": 0.48833411931991577, "rewards/fixed_code_pass_all_test_reward/mean": 0.18269231915473938, "rewards/fixed_code_pass_all_test_reward/std": 0.12462963908910751, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2395833432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22288468480110168, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 712.375, "completions/mean_terminated_length": 712.375, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.020844862571481278, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.00023685753149038646, "learning_rate": 1.3767670559311619e-06, "loss": 0.0, "num_tokens": 1035617.0, "reward": 0.65625, "reward_std": 0.5499594211578369, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.021029330381848368, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.00022843244551040698, "learning_rate": 1.3890596189305472e-06, "loss": 0.0, "num_tokens": 1042831.0, "reward": 1.1458332538604736, "reward_std": 0.8330356478691101, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02083333395421505, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 528.25, "completions/mean_terminated_length": 528.25, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.021213798192215458, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.0001968885972019052, "learning_rate": 1.4013521819299326e-06, "loss": 0.0, "num_tokens": 1053817.0, "reward": 1.0625, "reward_std": 0.5629958510398865, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.45806270837783813, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 595.5, "completions/mean_terminated_length": 595.5, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.021398266002582548, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.00020954668616468553, "learning_rate": 1.4136447449293178e-06, "loss": 0.0, "num_tokens": 1066269.0, "reward": 0.765625, "reward_std": 0.4745180904865265, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.015625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.04419417306780815, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.02158273381294964, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.0002731509794102749, "learning_rate": 1.4259373079287033e-06, "loss": 0.0, "num_tokens": 1071954.0, "reward": 1.6995534896850586, "reward_std": 0.7539024353027344, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07455357164144516, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08965104818344116, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 456.25, "completions/mean_terminated_length": 456.25, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.02176720162331673, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.00019865601643687114, "learning_rate": 1.4382298709280887e-06, "loss": 0.0, "num_tokens": 1080724.0, "reward": 0.6749999523162842, "reward_std": 0.5599744915962219, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.21380899846553802, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 385.125, "completions/mean_terminated_length": 385.125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.02195166943368382, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.0002011137275985675, "learning_rate": 1.450522433927474e-06, "loss": 0.0, "num_tokens": 1086941.0, "reward": 1.125, "reward_std": 0.9910312294960022, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 607.875, "completions/mean_terminated_length": 607.875, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.02213613724405091, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.00019001850296263, "learning_rate": 1.4628149969268592e-06, "loss": 0.0, "num_tokens": 1097596.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 385.625, "completions/mean_terminated_length": 385.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.022320605054418005, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.00022870255634188652, "learning_rate": 1.4751075599262448e-06, "loss": 0.0, "num_tokens": 1104937.0, "reward": 1.5322915315628052, "reward_std": 0.8685696721076965, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4072916805744171, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.32081979513168335, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 889.5, "completions/mean_terminated_length": 724.0000610351562, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.022505072864785095, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.00022206033645488787, "learning_rate": 1.4874001229256302e-06, "loss": 0.0, "num_tokens": 1118805.0, "reward": 0.6475961208343506, "reward_std": 0.6922891736030579, "rewards/fixed_code_pass_all_test_reward/mean": 0.09134615957736969, "rewards/fixed_code_pass_all_test_reward/std": 0.17315322160720825, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05624999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10500850528478622, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 401.375, "completions/mean_terminated_length": 401.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.022689540675152185, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.000266693719822797, "learning_rate": 1.4996926859250155e-06, "loss": 0.0, "num_tokens": 1130272.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 839.125, "completions/mean_terminated_length": 839.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.022874008485519275, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.00021368268426158465, "learning_rate": 1.5119852489244009e-06, "loss": 0.0, "num_tokens": 1144953.0, "reward": 0.9993451833724976, "reward_std": 0.42599624395370483, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.12434523552656174, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1447259783744812, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.02305847629588637, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.00024347108228539582, "learning_rate": 1.524277811923786e-06, "loss": 0.0, "num_tokens": 1149393.0, "reward": 1.375, "reward_std": 1.0498299598693848, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36645016074180603, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 548.375, "completions/mean_terminated_length": 548.375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.02324294410625346, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.00017617909361433703, "learning_rate": 1.5365703749231716e-06, "loss": 0.0, "num_tokens": 1162116.0, "reward": 2.309253215789795, "reward_std": 0.3638772666454315, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.35456061363220215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4967532157897949, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09789485484361649, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 659.625, "completions/mean_terminated_length": 659.625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.02342741191662055, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.00025627506511227693, "learning_rate": 1.548862937922557e-06, "loss": 0.0, "num_tokens": 1175377.0, "reward": 0.8125, "reward_std": 0.7039429545402527, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 554.875, "completions/mean_terminated_length": 554.875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.02361187972698764, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.0002866256418201374, "learning_rate": 1.5611555009219423e-06, "loss": 0.0, "num_tokens": 1184480.0, "reward": 0.43269234895706177, "reward_std": 0.5001056790351868, "rewards/fixed_code_pass_all_test_reward/mean": 0.18269231915473938, "rewards/fixed_code_pass_all_test_reward/std": 0.09136422723531723, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 454.25, "completions/mean_terminated_length": 454.25, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.023796347537354733, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.00020806512475246564, "learning_rate": 1.5734480639213275e-06, "loss": 0.0, "num_tokens": 1193618.0, "reward": 1.053125023841858, "reward_std": 0.7764890789985657, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17812499403953552, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16444577276706696, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1160.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 557.0, "completions/mean_terminated_length": 557.0, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.023980815347721823, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.0002732718448896776, "learning_rate": 1.5857406269207133e-06, "loss": 0.0, "num_tokens": 1204762.0, "reward": 0.8944444060325623, "reward_std": 0.7914445400238037, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.14444443583488464, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1574712097644806, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 379.75, "completions/mean_terminated_length": 379.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.024165283158088913, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.00022027229533705395, "learning_rate": 1.5980331899200984e-06, "loss": 0.0, "num_tokens": 1213616.0, "reward": 2.249427318572998, "reward_std": 0.3405623733997345, "rewards/fixed_code_pass_all_test_reward/mean": 0.7528408765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.1941581517457962, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4965863823890686, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2081645280122757, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 375.0, "completions/mean_terminated_length": 375.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.024349750968456003, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.0002254446499136975, "learning_rate": 1.6103257529194838e-06, "loss": 0.0, "num_tokens": 1219552.0, "reward": 1.5978631973266602, "reward_std": 0.9484922885894775, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.22286325693130493, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24379542469978333, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 581.5, "completions/mean_terminated_length": 581.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.024534218778823096, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.00021928165642748354, "learning_rate": 1.6226183159188692e-06, "loss": 0.0, "num_tokens": 1230500.0, "reward": 1.03397536277771, "reward_std": 0.9216347932815552, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1589752584695816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20992493629455566, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 546.25, "completions/mean_terminated_length": 546.25, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.024718686589190186, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.0002163659555662889, "learning_rate": 1.6349108789182547e-06, "loss": 0.0, "num_tokens": 1244254.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 857.5, "completions/mean_terminated_length": 857.5, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.024903154399557276, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.00017867935639515053, "learning_rate": 1.6472034419176399e-06, "loss": 0.0, "num_tokens": 1261506.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 661.125, "completions/mean_terminated_length": 661.125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.025087622209924366, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.0002373600691498723, "learning_rate": 1.6594960049170253e-06, "loss": 0.0, "num_tokens": 1273819.0, "reward": 0.9624999761581421, "reward_std": 0.947081983089447, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08749999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 660.375, "completions/mean_terminated_length": 660.375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.02527209002029146, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.00015512617756030522, "learning_rate": 1.6717885679164106e-06, "loss": 0.0, "num_tokens": 1284638.0, "reward": 0.8451389074325562, "reward_std": 0.7524237036705017, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09513889253139496, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10851887613534927, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 754.5, "completions/mean_terminated_length": 569.7142944335938, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.02545655783065855, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.0002561676756158704, "learning_rate": 1.6840811309157962e-06, "loss": 0.0, "num_tokens": 1297962.0, "reward": 0.9626436829566956, "reward_std": 0.7794632315635681, "rewards/fixed_code_pass_all_test_reward/mean": 0.44181033968925476, "rewards/fixed_code_pass_all_test_reward/std": 0.39745429158210754, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02083333395421505, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 301.75, "completions/mean_terminated_length": 301.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.02564102564102564, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.0002786101740639424, "learning_rate": 1.6963736939151816e-06, "loss": 0.0, "num_tokens": 1304368.0, "reward": 1.5604166984558105, "reward_std": 0.8124985098838806, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06041666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14027680456638336, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 429.375, "completions/mean_terminated_length": 429.375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.025825493451392734, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.00020269198648747988, "learning_rate": 1.7086662569145667e-06, "loss": 0.0, "num_tokens": 1314627.0, "reward": 1.3254659175872803, "reward_std": 0.5633125901222229, "rewards/fixed_code_pass_all_test_reward/mean": 0.40760868787765503, "rewards/fixed_code_pass_all_test_reward/std": 0.2191697508096695, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04285714402794838, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08081221580505371, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 197.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.026009961261759824, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.00019266623166913632, "learning_rate": 1.720958819913952e-06, "loss": 0.0, "num_tokens": 1319146.0, "reward": 0.8805555701255798, "reward_std": 0.7448292970657349, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25555557012557983, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.26051098108291626, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1224.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 497.0, "completions/mean_terminated_length": 497.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.026194429072126914, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.0004085133741682512, "learning_rate": 1.7332513829133377e-06, "loss": 0.0, "num_tokens": 1328082.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 585.625, "completions/mean_terminated_length": 376.71429443359375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.026378896882494004, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.0002128674168488942, "learning_rate": 1.745543945912723e-06, "loss": 0.0, "num_tokens": 1338415.0, "reward": 1.1361607313156128, "reward_std": 0.9454864859580994, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2611607313156128, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35056930780410767, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.026563364692861097, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.00025897837622324005, "learning_rate": 1.7578365089121084e-06, "loss": 0.0, "num_tokens": 1343301.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.026747832503228187, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.0004298805424696184, "learning_rate": 1.7701290719114935e-06, "loss": 0.0, "num_tokens": 1347513.0, "reward": 2.549999952316284, "reward_std": 0.49856942892074585, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.800000011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18516401946544647, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 586.625, "completions/mean_terminated_length": 586.625, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.026932300313595278, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.00022431595425587147, "learning_rate": 1.7824216349108791e-06, "loss": 0.0, "num_tokens": 1360742.0, "reward": 1.5817370414733887, "reward_std": 0.8122112154960632, "rewards/fixed_code_pass_all_test_reward/mean": 0.44602274894714355, "rewards/fixed_code_pass_all_test_reward/std": 0.4371573030948639, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2607142925262451, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3660322427749634, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 301.75, "completions/mean_terminated_length": 301.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.027116768123962368, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.0002590952544778702, "learning_rate": 1.7947141979102645e-06, "loss": 0.0, "num_tokens": 1369388.0, "reward": 1.5927083492279053, "reward_std": 0.7200011014938354, "rewards/fixed_code_pass_all_test_reward/mean": 0.5958333611488342, "rewards/fixed_code_pass_all_test_reward/std": 0.3856297731399536, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.12187499552965164, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10643366724252701, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1176.0, "completions/max_terminated_length": 1176.0, "completions/mean_length": 391.625, "completions/mean_terminated_length": 391.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.02730123593432946, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.0003064548218389973, "learning_rate": 1.8070067609096498e-06, "loss": 0.0, "num_tokens": 1375417.0, "reward": 0.703125, "reward_std": 0.8289879560470581, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.078125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17598575353622437, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 137.125, "completions/mean_terminated_length": 137.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.02748570374469655, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.000404661547690921, "learning_rate": 1.819299323909035e-06, "loss": 0.0, "num_tokens": 1379330.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 470.125, "completions/mean_terminated_length": 470.125, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.02767017155506364, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.0002806523407343775, "learning_rate": 1.8315918869084206e-06, "loss": 0.0, "num_tokens": 1387939.0, "reward": 0.53125, "reward_std": 0.5737953186035156, "rewards/fixed_code_pass_all_test_reward/mean": 0.03125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 422.125, "completions/mean_terminated_length": 422.125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.02785463936543073, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.0002353053387196269, "learning_rate": 1.843884449907806e-06, "loss": 0.0, "num_tokens": 1399236.0, "reward": 0.9408653974533081, "reward_std": 0.39512529969215393, "rewards/fixed_code_pass_all_test_reward/mean": 0.009615384973585606, "rewards/fixed_code_pass_all_test_reward/std": 0.017804233357310295, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05624999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10500850528478622, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 750.375, "completions/mean_terminated_length": 750.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.028039107175797825, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.00027518066599441227, "learning_rate": 1.8561770129071913e-06, "loss": 0.0, "num_tokens": 1413751.0, "reward": 1.2530303001403809, "reward_std": 0.678925096988678, "rewards/fixed_code_pass_all_test_reward/mean": 0.3863636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.5090677738189697, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11666666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1458418369293213, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.028223574986164915, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.0003075726326642325, "learning_rate": 1.8684695759065767e-06, "loss": 0.0, "num_tokens": 1418398.0, "reward": 1.212499976158142, "reward_std": 0.9203066229820251, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08749999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 454.375, "completions/mean_terminated_length": 454.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.028408042796532005, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0003413818621993414, "learning_rate": 1.8807621389059622e-06, "loss": 0.0, "num_tokens": 1427961.0, "reward": 0.6499999761581421, "reward_std": 0.5424810647964478, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 1093.0, "completions/mean_terminated_length": 1093.0, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 0.028592510606899095, "frac_reward_zero_std": 0.0, "grad_norm": 0.7734375, "kl": 0.00017650930749368854, "learning_rate": 1.8930547019053474e-06, "loss": 0.0, "num_tokens": 1443745.0, "reward": 0.8852941393852234, "reward_std": 0.8159737586975098, "rewards/fixed_code_pass_all_test_reward/mean": 0.3602941334247589, "rewards/fixed_code_pass_all_test_reward/std": 0.3213256001472473, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 415.5, "completions/mean_terminated_length": 415.5, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.02877697841726619, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.00020882244916720083, "learning_rate": 1.9053472649047328e-06, "loss": 0.0, "num_tokens": 1454813.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 411.75, "completions/mean_terminated_length": 411.75, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.02896144622763328, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.00038429802225437015, "learning_rate": 1.917639827904118e-06, "loss": 0.0, "num_tokens": 1463875.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 565.625, "completions/mean_terminated_length": 565.625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.02914591403800037, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.0001970265902855317, "learning_rate": 1.9299323909035037e-06, "loss": 0.0, "num_tokens": 1475096.0, "reward": 1.3587464094161987, "reward_std": 0.6915855407714844, "rewards/fixed_code_pass_all_test_reward/mean": 0.5488505959510803, "rewards/fixed_code_pass_all_test_reward/std": 0.2344980686903, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0598958320915699, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08060835301876068, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1418.0, "completions/mean_length": 852.5, "completions/mean_terminated_length": 681.7142944335938, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.02933038184836746, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.0002130235670847469, "learning_rate": 1.942224953902889e-06, "loss": 0.0, "num_tokens": 1487756.0, "reward": 1.4921555519104004, "reward_std": 0.6669631004333496, "rewards/fixed_code_pass_all_test_reward/mean": 0.5765305757522583, "rewards/fixed_code_pass_all_test_reward/std": 0.345822274684906, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04062500223517418, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0686371773481369, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 627.875, "completions/mean_terminated_length": 627.875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.029514849658734552, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.0002820888112182729, "learning_rate": 1.9545175169022744e-06, "loss": 0.0, "num_tokens": 1499499.0, "reward": 1.4236111640930176, "reward_std": 1.0502781867980957, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2986111342906952, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3185858130455017, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 533.5, "completions/mean_terminated_length": 533.5, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.029699317469101642, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.0002438643250570749, "learning_rate": 1.9668100799016596e-06, "loss": 0.0, "num_tokens": 1509639.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 248.0, "completions/mean_terminated_length": 248.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.029883785279468732, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.0002731181630224455, "learning_rate": 1.979102642901045e-06, "loss": 0.0, "num_tokens": 1515407.0, "reward": 0.5568182468414307, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.1818181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.030068253089835822, "frac_reward_zero_std": 0.0, "grad_norm": 3.96875, "kl": 0.0004077297417097725, "learning_rate": 1.9913952059004303e-06, "loss": 0.0, "num_tokens": 1521558.0, "reward": 1.1875, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 374.0, "completions/mean_terminated_length": 374.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.030252720900202916, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0004525748208834557, "learning_rate": 2.003687768899816e-06, "loss": 0.0, "num_tokens": 1528918.0, "reward": 1.515625, "reward_std": 1.034104824066162, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.140625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17086362838745117, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 397.375, "completions/mean_terminated_length": 397.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.030437188710570006, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.0002576028609837522, "learning_rate": 2.015980331899201e-06, "loss": 0.0, "num_tokens": 1539089.0, "reward": 1.3597221374511719, "reward_std": 0.7666767239570618, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.49888765811920166, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20972222089767456, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.29441824555397034, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 436.625, "completions/mean_terminated_length": 436.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.030621656520937096, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.00036805544004892, "learning_rate": 2.0282728948985866e-06, "loss": 0.0, "num_tokens": 1548126.0, "reward": 0.6666666269302368, "reward_std": 0.6283639669418335, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.19416078925132751, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 427.625, "completions/mean_terminated_length": 427.625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.030806124331304186, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.00029136257035133895, "learning_rate": 2.0405654578979718e-06, "loss": 0.0, "num_tokens": 1557267.0, "reward": 1.149999976158142, "reward_std": 0.8668497800827026, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 149.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.03099059214167128, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.0006196143785928143, "learning_rate": 2.0528580208973573e-06, "loss": 0.0, "num_tokens": 1561193.0, "reward": 1.6029514074325562, "reward_std": 0.9949027895927429, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10295139253139496, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12259691953659058, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 827.75, "completions/mean_terminated_length": 653.4285888671875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.03117505995203837, "frac_reward_zero_std": 0.0, "grad_norm": 0.7109375, "kl": 0.00023428743497788673, "learning_rate": 2.0651505838967425e-06, "loss": 0.0, "num_tokens": 1574895.0, "reward": 1.126173496246338, "reward_std": 0.6114586591720581, "rewards/fixed_code_pass_all_test_reward/mean": 0.171875, "rewards/fixed_code_pass_all_test_reward/std": 0.3402828574180603, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07929842174053192, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11009665578603745, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 323.75, "completions/mean_terminated_length": 323.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.03135952776240546, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0004789967370015802, "learning_rate": 2.077443146896128e-06, "loss": 0.0, "num_tokens": 1581229.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 588.25, "completions/mean_terminated_length": 588.25, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.03154399557277255, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.00019811845777439885, "learning_rate": 2.0897357098955136e-06, "loss": 0.0, "num_tokens": 1592479.0, "reward": 1.5386905670166016, "reward_std": 0.3508371412754059, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.2920915186405182, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0386904776096344, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0719228982925415, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 240.5, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.03172846338313964, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.0004376200286060339, "learning_rate": 2.102028272894899e-06, "loss": 0.0, "num_tokens": 1597323.0, "reward": 1.536484956741333, "reward_std": 0.8145149946212769, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.16148504614830017, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12858058512210846, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 382.375, "completions/mean_terminated_length": 382.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.03191293119350673, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.00034247084931848804, "learning_rate": 2.114320835894284e-06, "loss": 0.0, "num_tokens": 1604222.0, "reward": 2.06756854057312, "reward_std": 0.5326680541038513, "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2175685316324234, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17070548236370087, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 703.375, "completions/mean_terminated_length": 703.375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.032097399003873824, "frac_reward_zero_std": 0.0, "grad_norm": 0.56640625, "kl": 0.00017431498963560443, "learning_rate": 2.1266133988936695e-06, "loss": 0.0, "num_tokens": 1621985.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 557.0, "completions/mean_terminated_length": 557.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.03228186681424092, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.000459586723081884, "learning_rate": 2.138905961893055e-06, "loss": 0.0, "num_tokens": 1634193.0, "reward": 1.7736111879348755, "reward_std": 0.8440672159194946, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.14861111342906952, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17747370898723602, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 837.125, "completions/mean_terminated_length": 664.1428833007812, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.032466334624608004, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.00028740140805894043, "learning_rate": 2.1511985248924403e-06, "loss": 0.0, "num_tokens": 1645666.0, "reward": 0.6875, "reward_std": 0.6512351036071777, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 474.625, "completions/mean_terminated_length": 474.625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.0326508024349751, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.0003348938062117668, "learning_rate": 2.1634910878918254e-06, "loss": 0.0, "num_tokens": 1654023.0, "reward": 0.606249988079071, "reward_std": 0.6742389798164368, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10625000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21784251928329468, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 339.375, "completions/mean_terminated_length": 339.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.03283527024534219, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.0005127146450831788, "learning_rate": 2.175783650891211e-06, "loss": 0.0, "num_tokens": 1662722.0, "reward": 0.9249999523162842, "reward_std": 0.6584613919258118, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 486.875, "completions/mean_terminated_length": 486.875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.03301973805570928, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.0002247001975774765, "learning_rate": 2.1880762138905966e-06, "loss": 0.0, "num_tokens": 1674545.0, "reward": 1.2527778148651123, "reward_std": 0.5902126431465149, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.14471891522407532, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.12777778506278992, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18468716740608215, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 349.25, "completions/mean_terminated_length": 349.25, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.03320420586607637, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.0003258617534811492, "learning_rate": 2.2003687768899817e-06, "loss": 0.0, "num_tokens": 1680059.0, "reward": 1.162500023841858, "reward_std": 1.0336309671401978, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03750000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1060660257935524, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 489.5, "completions/mean_terminated_length": 489.5, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.03338867367644346, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.00041503602915327065, "learning_rate": 2.212661339889367e-06, "loss": 0.0, "num_tokens": 1688799.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 394.25, "completions/mean_terminated_length": 394.25, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.03357314148681055, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.00031451320683117956, "learning_rate": 2.2249539028887524e-06, "loss": 0.0, "num_tokens": 1698961.0, "reward": 1.3171581029891968, "reward_std": 0.5294672846794128, "rewards/fixed_code_pass_all_test_reward/mean": 0.09756097197532654, "rewards/fixed_code_pass_all_test_reward/std": 0.1316685974597931, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.34459707140922546, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1949939727783203, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 506.875, "completions/mean_terminated_length": 506.875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.033757609297177645, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.00030739402427570894, "learning_rate": 2.237246465888138e-06, "loss": 0.0, "num_tokens": 1710872.0, "reward": 1.3062500953674316, "reward_std": 0.5646981000900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.18125000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13076014816761017, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.03394207710754473, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.00038252518061199225, "learning_rate": 2.249539028887523e-06, "loss": 0.0, "num_tokens": 1715877.0, "reward": 1.006250023841858, "reward_std": 0.7272440791130066, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13124999403953552, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15338443219661713, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 697.25, "completions/mean_terminated_length": 504.2857360839844, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.034126544917911825, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.0003728813426278066, "learning_rate": 2.2618315918869087e-06, "loss": 0.0, "num_tokens": 1725415.0, "reward": 1.1343750953674316, "reward_std": 0.9084855318069458, "rewards/fixed_code_pass_all_test_reward/mean": 0.484375, "rewards/fixed_code_pass_all_test_reward/std": 0.469790518283844, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15000000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17728105187416077, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 478.75, "completions/mean_terminated_length": 478.75, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.03431101272827892, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.00034587618392833974, "learning_rate": 2.274124154886294e-06, "loss": 0.0, "num_tokens": 1733925.0, "reward": 0.8897058963775635, "reward_std": 0.4140841066837311, "rewards/fixed_code_pass_all_test_reward/mean": 0.13970588147640228, "rewards/fixed_code_pass_all_test_reward/std": 0.10863599926233292, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 508.625, "completions/mean_terminated_length": 508.625, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.034495480538646005, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.00025921815358742606, "learning_rate": 2.2864167178856795e-06, "loss": 0.0, "num_tokens": 1742866.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 317.875, "completions/mean_terminated_length": 317.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.0346799483490131, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.0003331234947836492, "learning_rate": 2.2987092808850646e-06, "loss": 0.0, "num_tokens": 1751417.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 364.25, "completions/mean_terminated_length": 364.25, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.034864416159380185, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.0006526256802317221, "learning_rate": 2.31100184388445e-06, "loss": 0.0, "num_tokens": 1761051.0, "reward": 1.350000023841858, "reward_std": 0.8332380652427673, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.22499999403953552, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24928469955921173, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 936.125, "completions/mean_terminated_length": 777.2857666015625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.03504888396974728, "frac_reward_zero_std": 0.0, "grad_norm": 0.7421875, "kl": 0.000260414119111374, "learning_rate": 2.3232944068838354e-06, "loss": 0.0, "num_tokens": 1776820.0, "reward": 0.7124999761581421, "reward_std": 0.6010407209396362, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08750000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13562028110027313, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1367.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 429.75, "completions/mean_terminated_length": 429.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.03523335178011437, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.0003187157562933862, "learning_rate": 2.335586969883221e-06, "loss": 0.0, "num_tokens": 1783450.0, "reward": 0.8178570866584778, "reward_std": 0.5092009902000427, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06785714626312256, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0788954347372055, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 436.25, "completions/mean_terminated_length": 436.25, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.03541781959048146, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.0006478614941443084, "learning_rate": 2.347879532882606e-06, "loss": 0.0, "num_tokens": 1794252.0, "reward": 1.0750000476837158, "reward_std": 0.799553394317627, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10350984334945679, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 608.0, "completions/mean_terminated_length": 402.2857360839844, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.03560228740084855, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.0004322936474636663, "learning_rate": 2.3601720958819917e-06, "loss": 0.0, "num_tokens": 1805028.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 377.25, "completions/mean_terminated_length": 377.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.035786755211215646, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0004939322982409067, "learning_rate": 2.372464658881377e-06, "loss": 0.0, "num_tokens": 1810814.0, "reward": 0.8500000238418579, "reward_std": 0.7982122302055359, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10000000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2828427255153656, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 509.25, "completions/mean_terminated_length": 509.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.03597122302158273, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.0005289382152113831, "learning_rate": 2.3847572218807624e-06, "loss": 0.0, "num_tokens": 1819328.0, "reward": 0.9283602237701416, "reward_std": 0.6695959568023682, "rewards/fixed_code_pass_all_test_reward/mean": 0.27419355511665344, "rewards/fixed_code_pass_all_test_reward/std": 0.23829832673072815, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15416666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2513061463832855, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 569.5, "completions/mean_terminated_length": 569.5, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.036155690831949826, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.00036207529592502397, "learning_rate": 2.3970497848801475e-06, "loss": 0.0, "num_tokens": 1828788.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 320.0, "completions/mean_terminated_length": 320.0, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.03634015864231691, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.0002694746490305988, "learning_rate": 2.409342347879533e-06, "loss": 0.0, "num_tokens": 1834412.0, "reward": 1.4354166984558105, "reward_std": 0.6210870146751404, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.18541666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.30282238125801086, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 457.25, "completions/mean_terminated_length": 457.25, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.036524626452684006, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.000572481396375224, "learning_rate": 2.4216349108789183e-06, "loss": 0.0, "num_tokens": 1843582.0, "reward": 1.462499976158142, "reward_std": 0.9527216553688049, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21250000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.338853120803833, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 440.75, "completions/mean_terminated_length": 440.75, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.0367090942630511, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.00045562447303382214, "learning_rate": 2.433927473878304e-06, "loss": 0.0, "num_tokens": 1851524.0, "reward": 1.0220588445663452, "reward_std": 0.6319375038146973, "rewards/fixed_code_pass_all_test_reward/mean": 0.6470588445663452, "rewards/fixed_code_pass_all_test_reward/std": 0.36261260509490967, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.036893562073418186, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0013324526130418235, "learning_rate": 2.4462200368776894e-06, "loss": 0.0001, "num_tokens": 1858007.0, "reward": 1.7218749523162842, "reward_std": 1.224412202835083, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.34687501192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4054180979728699, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 836.125, "completions/mean_terminated_length": 836.125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.03707802988378528, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.0003329765131638851, "learning_rate": 2.4585125998770746e-06, "loss": 0.0, "num_tokens": 1874360.0, "reward": 0.7276785373687744, "reward_std": 0.6510077118873596, "rewards/fixed_code_pass_all_test_reward/mean": 0.2276785671710968, "rewards/fixed_code_pass_all_test_reward/std": 0.26789966225624084, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.03726249769415237, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.00031286675584851764, "learning_rate": 2.4708051628764597e-06, "loss": 0.0, "num_tokens": 1882427.0, "reward": 1.916517972946167, "reward_std": 0.6602216958999634, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.27894893288612366, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.44776785373687744, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2154131680727005, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.03744696550451946, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.0014937967534933705, "learning_rate": 2.4830977258758453e-06, "loss": 0.0001, "num_tokens": 1886570.0, "reward": 1.475000023841858, "reward_std": 0.8614771366119385, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10000000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21380899846553802, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 587.0, "completions/mean_terminated_length": 587.0, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.03763143331488655, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.00033350362537021283, "learning_rate": 2.495390288875231e-06, "loss": 0.0, "num_tokens": 1898538.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1869.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 765.25, "completions/mean_terminated_length": 765.25, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.03781590112525364, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.00034772823346429504, "learning_rate": 2.507682851874616e-06, "loss": 0.0, "num_tokens": 1909780.0, "reward": 0.856249988079071, "reward_std": 0.5678515434265137, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10625000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21784251928329468, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 627.25, "completions/mean_terminated_length": 627.25, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.03800036893562073, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.0005103383846289944, "learning_rate": 2.519975414874001e-06, "loss": 0.0, "num_tokens": 1921838.0, "reward": 1.0750000476837158, "reward_std": 0.5946187376976013, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10350984334945679, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 436.0, "completions/mean_terminated_length": 436.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.03818483674598783, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.00043009742148569785, "learning_rate": 2.5322679778733868e-06, "loss": 0.0, "num_tokens": 1928134.0, "reward": 1.0599205493927002, "reward_std": 1.0138577222824097, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.30992063879966736, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3496263921260834, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 324.5, "completions/mean_terminated_length": 324.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.03836930455635491, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.0011202831574337324, "learning_rate": 2.544560540872772e-06, "loss": 0.0, "num_tokens": 1936146.0, "reward": 0.8177083134651184, "reward_std": 0.5213389992713928, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0677083358168602, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13717946410179138, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 643.5, "completions/mean_terminated_length": 643.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.03855377236672201, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.0007639047635166207, "learning_rate": 2.556853103872158e-06, "loss": 0.0, "num_tokens": 1945190.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 849.25, "completions/mean_terminated_length": 849.25, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.0387382401770891, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.0005065084442321677, "learning_rate": 2.569145666871543e-06, "loss": 0.0, "num_tokens": 1958400.0, "reward": 0.7749999761581421, "reward_std": 0.48329225182533264, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.03892270798745619, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.0006058130311430432, "learning_rate": 2.5814382298709282e-06, "loss": 0.0, "num_tokens": 1965013.0, "reward": 1.1749999523162842, "reward_std": 0.5897941589355469, "rewards/fixed_code_pass_all_test_reward/mean": 0.42500001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.24928469955921173, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 406.5, "completions/mean_terminated_length": 406.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.03910717579782328, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.000726219434000086, "learning_rate": 2.593730792870314e-06, "loss": 0.0, "num_tokens": 1972673.0, "reward": 0.45192307233810425, "reward_std": 0.7079281806945801, "rewards/fixed_code_pass_all_test_reward/mean": 0.32692307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.42878273129463196, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 396.125, "completions/mean_terminated_length": 396.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.039291643608190374, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.0005511300223588478, "learning_rate": 2.606023355869699e-06, "loss": 0.0, "num_tokens": 1980162.0, "reward": 0.7931034564971924, "reward_std": 0.5196985602378845, "rewards/fixed_code_pass_all_test_reward/mean": 0.16810345649719238, "rewards/fixed_code_pass_all_test_reward/std": 0.0769679844379425, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 479.5, "completions/mean_terminated_length": 479.5, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.03947611141855746, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.00020058286281710025, "learning_rate": 2.6183159188690845e-06, "loss": 0.0, "num_tokens": 1988206.0, "reward": 1.9500993490219116, "reward_std": 0.6377547383308411, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3000992238521576, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20927132666110992, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 1264.5, "completions/mean_terminated_length": 1003.3333740234375, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 0.039660579228924554, "frac_reward_zero_std": 0.0, "grad_norm": 0.640625, "kl": 0.00015692944498368888, "learning_rate": 2.6306084818684697e-06, "loss": 0.0, "num_tokens": 2008810.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.03984504703929164, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.0007798120204824954, "learning_rate": 2.642901044867855e-06, "loss": 0.0, "num_tokens": 2015104.0, "reward": 0.8374999761581421, "reward_std": 0.49785250425338745, "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, "rewards/fixed_code_pass_all_test_reward/std": 0.34557148814201355, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05624999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10500850528478622, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 235.375, "completions/mean_terminated_length": 235.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.040029514849658734, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.00036840429311268963, "learning_rate": 2.655193607867241e-06, "loss": 0.0, "num_tokens": 2019899.0, "reward": 1.53125, "reward_std": 0.5324723720550537, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.28125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2103356271982193, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 278.25, "completions/mean_terminated_length": 278.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.04021398266002583, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.000419481879362138, "learning_rate": 2.667486170866626e-06, "loss": 0.0, "num_tokens": 2030021.0, "reward": 2.71423602104187, "reward_std": 0.4204050898551941, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8392361402511597, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2013208121061325, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 508.625, "completions/mean_terminated_length": 508.625, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.040398450470392915, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.00041335527021146845, "learning_rate": 2.6797787338660116e-06, "loss": 0.0, "num_tokens": 2040786.0, "reward": 1.7724359035491943, "reward_std": 0.24226070940494537, "rewards/fixed_code_pass_all_test_reward/mean": 0.7307692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.18388132750988007, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1271.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 661.375, "completions/mean_terminated_length": 661.375, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.04058291828076001, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.00027158146076544654, "learning_rate": 2.6920712968653967e-06, "loss": 0.0, "num_tokens": 2052445.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 644.0, "completions/mean_terminated_length": 443.4285888671875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.0407673860911271, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.0002703534373722505, "learning_rate": 2.704363859864782e-06, "loss": 0.0, "num_tokens": 2063741.0, "reward": 0.8562500476837158, "reward_std": 0.5368143320083618, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10624999552965164, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11475907266139984, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 259.5, "completions/mean_terminated_length": 259.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.04095185390149419, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.0005723480535380077, "learning_rate": 2.7166564228641674e-06, "loss": 0.0, "num_tokens": 2068697.0, "reward": 1.6916667222976685, "reward_std": 0.6396303772926331, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3166666626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20314979553222656, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 324.5, "completions/mean_terminated_length": 324.5, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.04113632171186128, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.00046552025742130354, "learning_rate": 2.7289489858635526e-06, "loss": 0.0, "num_tokens": 2074317.0, "reward": 0.7000000476837158, "reward_std": 0.5855400562286377, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10350984334945679, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.04132078952222837, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.0006259580113692209, "learning_rate": 2.7412415488629377e-06, "loss": 0.0, "num_tokens": 2078976.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.04150525733259546, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.00034428946673870087, "learning_rate": 2.7535341118623237e-06, "loss": 0.0, "num_tokens": 2084180.0, "reward": 1.6199405193328857, "reward_std": 0.6888697147369385, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.16160716116428375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21513226628303528, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 412.75, "completions/mean_terminated_length": 412.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.041689725142962555, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.0015161023438849952, "learning_rate": 2.765826674861709e-06, "loss": 0.0001, "num_tokens": 2092698.0, "reward": 0.8062499761581421, "reward_std": 0.507400631904602, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05624999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10500850528478622, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 440.0, "completions/mean_terminated_length": 440.0, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.04187419295332964, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.00042658077290980145, "learning_rate": 2.7781192378610945e-06, "loss": 0.0, "num_tokens": 2102706.0, "reward": 0.9406249523162842, "reward_std": 0.39049819111824036, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06562499701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09348176419734955, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.042058660763696736, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "kl": 0.0018624804542923812, "learning_rate": 2.7904118008604796e-06, "loss": 0.0001, "num_tokens": 2107073.0, "reward": 1.1749999523162842, "reward_std": 0.7285014390945435, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213627576828, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 645.125, "completions/mean_terminated_length": 645.125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.04224312857406383, "frac_reward_zero_std": 0.0, "grad_norm": 0.71484375, "kl": 0.00027368878727429546, "learning_rate": 2.802704363859865e-06, "loss": 0.0, "num_tokens": 2121962.0, "reward": 1.7616994380950928, "reward_std": 0.7162973880767822, "rewards/fixed_code_pass_all_test_reward/mean": 0.8460743427276611, "rewards/fixed_code_pass_all_test_reward/std": 0.3430623412132263, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04062499850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07784772664308548, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 451.125, "completions/mean_terminated_length": 451.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.042427596384430916, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.00030411591524170944, "learning_rate": 2.8149969268592504e-06, "loss": 0.0, "num_tokens": 2132115.0, "reward": 1.7142857313156128, "reward_std": 0.5150787830352783, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0892857164144516, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25253814458847046, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 475.0, "completions/mean_terminated_length": 475.0, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.04261206419479801, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.0003990456316387281, "learning_rate": 2.8272894898586355e-06, "loss": 0.0, "num_tokens": 2140299.0, "reward": 1.2272727489471436, "reward_std": 0.3391773998737335, "rewards/fixed_code_pass_all_test_reward/mean": 0.28977274894714355, "rewards/fixed_code_pass_all_test_reward/std": 0.2342280000448227, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0916125476360321, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 608.875, "completions/mean_terminated_length": 608.875, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.042796532005165096, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0003711564231707598, "learning_rate": 2.839582052858021e-06, "loss": 0.0, "num_tokens": 2155770.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 461.25, "completions/mean_terminated_length": 461.25, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.04298099981553219, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.0005496324483829085, "learning_rate": 2.8518746158574067e-06, "loss": 0.0, "num_tokens": 2166772.0, "reward": 0.8704023361206055, "reward_std": 0.6603036522865295, "rewards/fixed_code_pass_all_test_reward/mean": 0.23706895112991333, "rewards/fixed_code_pass_all_test_reward/std": 0.2553570866584778, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.008333333767950535, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0235702283680439, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 687.625, "completions/mean_terminated_length": 687.625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.04316546762589928, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.00041242474981117994, "learning_rate": 2.8641671788567922e-06, "loss": 0.0, "num_tokens": 2180457.0, "reward": 1.4175477027893066, "reward_std": 0.6038233637809753, "rewards/fixed_code_pass_all_test_reward/mean": 0.2063492238521576, "rewards/fixed_code_pass_all_test_reward/std": 0.27348482608795166, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.33619844913482666, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2600438892841339, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 406.125, "completions/mean_terminated_length": 406.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.04334993543626637, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.00032419562649010913, "learning_rate": 2.8764597418561774e-06, "loss": 0.0, "num_tokens": 2186634.0, "reward": 1.5437500476837158, "reward_std": 0.912389874458313, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.29374998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2678452432155609, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 330.5, "completions/mean_terminated_length": 330.5, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.04353440324663346, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0008236439862230327, "learning_rate": 2.8887523048555625e-06, "loss": 0.0, "num_tokens": 2193582.0, "reward": 0.9812499284744263, "reward_std": 0.4366573393344879, "rewards/fixed_code_pass_all_test_reward/mean": 0.48125001788139343, "rewards/fixed_code_pass_all_test_reward/std": 0.3127498924732208, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.043718871057000556, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.0010170043824473396, "learning_rate": 2.901044867854948e-06, "loss": 0.0, "num_tokens": 2202412.0, "reward": 1.0499999523162842, "reward_std": 0.09258202463388443, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.04390333886736764, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.002208568032074254, "learning_rate": 2.9133374308543333e-06, "loss": 0.0001, "num_tokens": 2206394.0, "reward": 1.4249999523162842, "reward_std": 1.2162588834762573, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.30000001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.42761799693107605, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 346.125, "completions/mean_terminated_length": 346.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.04408780667773474, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.0007820182108844165, "learning_rate": 2.9256299938537184e-06, "loss": 0.0, "num_tokens": 2215483.0, "reward": 1.5214284658432007, "reward_std": 0.9251585006713867, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.27142858505249023, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3331972658634186, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1119.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 538.125, "completions/mean_terminated_length": 538.125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.04427227448810182, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.0010011577396653593, "learning_rate": 2.937922556853104e-06, "loss": 0.0, "num_tokens": 2224396.0, "reward": 0.8636363744735718, "reward_std": 0.3870314955711365, "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363446712494, "rewards/fixed_code_pass_all_test_reward/std": 0.378549724817276, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 490.125, "completions/mean_terminated_length": 490.125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.04445674229846892, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.0009147427772404626, "learning_rate": 2.9502151198524896e-06, "loss": 0.0, "num_tokens": 2232653.0, "reward": 0.8416666388511658, "reward_std": 0.4913348853588104, "rewards/fixed_code_pass_all_test_reward/mean": 0.0416666679084301, "rewards/fixed_code_pass_all_test_reward/std": 0.023735923692584038, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 161.125, "completions/mean_terminated_length": 161.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.04464121010883601, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.0030622983740613563, "learning_rate": 2.962507682851875e-06, "loss": 0.0001, "num_tokens": 2237462.0, "reward": 1.7048611640930176, "reward_std": 0.9342278838157654, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2048611044883728, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25927993655204773, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 398.625, "completions/mean_terminated_length": 398.625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.0448256779192031, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.0015167073288466781, "learning_rate": 2.9748002458512603e-06, "loss": 0.0001, "num_tokens": 2248219.0, "reward": 0.8340277671813965, "reward_std": 0.5252901911735535, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08402777463197708, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11673986911773682, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 453.0, "completions/mean_terminated_length": 453.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.04501014572957019, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.0007396207001875155, "learning_rate": 2.987092808850646e-06, "loss": 0.0, "num_tokens": 2258163.0, "reward": 1.6447917222976685, "reward_std": 0.7945365905761719, "rewards/fixed_code_pass_all_test_reward/mean": 0.6197916269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.5134295225143433, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.14999999105930328, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14880476891994476, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 227.25, "completions/mean_terminated_length": 227.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.045194613539937284, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0008829107791825663, "learning_rate": 2.999385371850031e-06, "loss": 0.0, "num_tokens": 2266517.0, "reward": 1.2869048118591309, "reward_std": 0.6587564945220947, "rewards/fixed_code_pass_all_test_reward/mean": 0.3869047462940216, "rewards/fixed_code_pass_all_test_reward/std": 0.4682587683200836, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 395.75, "completions/mean_terminated_length": 395.75, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.04537908135030437, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.0012675870348175522, "learning_rate": 3.011677934849416e-06, "loss": 0.0001, "num_tokens": 2276651.0, "reward": 1.321428656578064, "reward_std": 0.14787118136882782, "rewards/fixed_code_pass_all_test_reward/mean": 0.3214285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.147871196269989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 343.25, "completions/mean_terminated_length": 343.25, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.045563549160671464, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.0006958170197322033, "learning_rate": 3.0239704978488018e-06, "loss": 0.0, "num_tokens": 2286093.0, "reward": 1.5, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10350984334945679, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 297.125, "completions/mean_terminated_length": 297.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.04574801697103855, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.00021461331198224798, "learning_rate": 3.036263060848187e-06, "loss": 0.0, "num_tokens": 2292214.0, "reward": 1.6895833015441895, "reward_std": 0.614567756652832, "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.43129101395606995, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.33541667461395264, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.29228675365448, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 319.0, "completions/mean_terminated_length": 319.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.045932484781405644, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.002185521847422933, "learning_rate": 3.048555623847572e-06, "loss": 0.0001, "num_tokens": 2298678.0, "reward": 1.5677454471588135, "reward_std": 0.7252529859542847, "rewards/fixed_code_pass_all_test_reward/mean": 0.67578125, "rewards/fixed_code_pass_all_test_reward/std": 0.26930874586105347, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.14196428656578064, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16164414584636688, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 978.875, "completions/mean_terminated_length": 826.1428833007812, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.04611695259177274, "frac_reward_zero_std": 0.0, "grad_norm": 0.34375, "kl": 0.0008421583552262746, "learning_rate": 3.060848186846958e-06, "loss": 0.0, "num_tokens": 2313093.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 427.625, "completions/mean_terminated_length": 427.625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.046301420402139824, "frac_reward_zero_std": 1.0, "grad_norm": 0.006011962890625, "kl": 0.0003215114957129117, "learning_rate": 3.0731407498463432e-06, "loss": 0.0, "num_tokens": 2320610.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 142.625, "completions/mean_terminated_length": 142.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.04648588821250692, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.001002294768113643, "learning_rate": 3.085433312845729e-06, "loss": 0.0, "num_tokens": 2324495.0, "reward": 1.8732143640518188, "reward_std": 0.4592704176902771, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24821428954601288, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16815540194511414, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 141.625, "completions/mean_terminated_length": 141.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.04667035602287401, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.0024963335563370492, "learning_rate": 3.097725875845114e-06, "loss": 0.0001, "num_tokens": 2328356.0, "reward": 1.8869047164916992, "reward_std": 0.5938555002212524, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.386904776096344, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.181484192609787, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 469.875, "completions/mean_terminated_length": 469.875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.0468548238332411, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.0009627028048271313, "learning_rate": 3.110018438844499e-06, "loss": 0.0, "num_tokens": 2337459.0, "reward": 1.0107142925262451, "reward_std": 0.5353808999061584, "rewards/fixed_code_pass_all_test_reward/mean": 0.13571429252624512, "rewards/fixed_code_pass_all_test_reward/std": 0.3505098521709442, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 490.125, "completions/mean_terminated_length": 267.5714416503906, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.04703929164360819, "frac_reward_zero_std": 0.0, "grad_norm": 0.58984375, "kl": 0.0004161010256211739, "learning_rate": 3.1223110018438847e-06, "loss": 0.0, "num_tokens": 2344412.0, "reward": 0.8999999761581421, "reward_std": 0.37032803893089294, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 448.75, "completions/mean_terminated_length": 448.75, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.04722375945397528, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.0010497734292584937, "learning_rate": 3.13460356484327e-06, "loss": 0.0, "num_tokens": 2352690.0, "reward": 0.9624999761581421, "reward_std": 0.4274091422557831, "rewards/fixed_code_pass_all_test_reward/mean": 0.08749999850988388, "rewards/fixed_code_pass_all_test_reward/std": 0.18077215552330017, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 452.25, "completions/mean_terminated_length": 452.25, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.04740822726434237, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.000924706386285834, "learning_rate": 3.146896127842655e-06, "loss": 0.0, "num_tokens": 2361876.0, "reward": 1.4166667461395264, "reward_std": 0.8248135447502136, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1666666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21675822138786316, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 357.125, "completions/mean_terminated_length": 357.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.047592695074709465, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.0007295188952411991, "learning_rate": 3.159188690842041e-06, "loss": 0.0, "num_tokens": 2370669.0, "reward": 1.1875, "reward_std": 0.20133483409881592, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20133483409881592, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 181.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.04777716288507655, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.0006209965795278549, "learning_rate": 3.1714812538414266e-06, "loss": 0.0, "num_tokens": 2374943.0, "reward": 1.3208333253860474, "reward_std": 0.42833659052848816, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07083333283662796, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09829902648925781, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.047961630695443645, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.0008979507656476926, "learning_rate": 3.1837738168408117e-06, "loss": 0.0, "num_tokens": 2379167.0, "reward": 1.2000000476837158, "reward_std": 0.33806172013282776, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10350984334945679, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 565.25, "completions/mean_terminated_length": 565.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.04814609850581074, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.001632251533010276, "learning_rate": 3.196066379840197e-06, "loss": 0.0001, "num_tokens": 2389241.0, "reward": 1.4071428775787354, "reward_std": 0.7774894833564758, "rewards/fixed_code_pass_all_test_reward/mean": 0.8035714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.3576526939868927, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10357142984867096, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1593993753194809, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 436.875, "completions/mean_terminated_length": 436.875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.048330566316177825, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.0007649754916201346, "learning_rate": 3.2083589428395824e-06, "loss": 0.0, "num_tokens": 2400696.0, "reward": 1.8504128456115723, "reward_std": 0.4546144902706146, "rewards/fixed_code_pass_all_test_reward/mean": 0.7217261791229248, "rewards/fixed_code_pass_all_test_reward/std": 0.39117541909217834, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1286865919828415, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12332616746425629, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 240.75, "completions/mean_terminated_length": 240.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.04851503412654492, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0012145635701017454, "learning_rate": 3.2206515058389676e-06, "loss": 0.0, "num_tokens": 2408414.0, "reward": 1.9739583730697632, "reward_std": 0.48630934953689575, "rewards/fixed_code_pass_all_test_reward/mean": 0.8218749761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.36386749148368835, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2770833373069763, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14609670639038086, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 406.375, "completions/mean_terminated_length": 406.375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.048699501936912006, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0007907449908088893, "learning_rate": 3.2329440688383527e-06, "loss": 0.0, "num_tokens": 2418641.0, "reward": 1.493070363998413, "reward_std": 0.1627548336982727, "rewards/fixed_code_pass_all_test_reward/mean": 0.32926827669143677, "rewards/fixed_code_pass_all_test_reward/std": 0.08347839117050171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.16380208730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1161389946937561, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 445.0, "completions/mean_terminated_length": 216.00001525878906, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.0488839697472791, "frac_reward_zero_std": 0.0, "grad_norm": 0.73046875, "kl": 0.0009289317604270764, "learning_rate": 3.2452366318377383e-06, "loss": 0.0, "num_tokens": 2425465.0, "reward": 0.8999999761581421, "reward_std": 0.37032803893089294, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 449.625, "completions/mean_terminated_length": 449.625, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.04906843755764619, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.0008610899858467747, "learning_rate": 3.257529194837124e-06, "loss": 0.0, "num_tokens": 2434054.0, "reward": 1.5, "reward_std": 0.2020304799079895, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.2020305097103119, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 758.25, "completions/mean_terminated_length": 758.25, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.04925290536801328, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.0008359812964044977, "learning_rate": 3.2698217578365095e-06, "loss": 0.0, "num_tokens": 2448848.0, "reward": 0.78125, "reward_std": 0.4898523688316345, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 415.875, "completions/mean_terminated_length": 415.875, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.04943737317838037, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.0008625024165667128, "learning_rate": 3.2821143208358946e-06, "loss": 0.0, "num_tokens": 2460095.0, "reward": 1.1375000476837158, "reward_std": 0.1767767071723938, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13750000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767767071723938, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.049621840988747466, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.0005388932331698015, "learning_rate": 3.2944068838352798e-06, "loss": 0.0, "num_tokens": 2465502.0, "reward": 2.135714530944824, "reward_std": 0.487132728099823, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.3967800438404083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3500000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1976047158241272, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 355.375, "completions/mean_terminated_length": 355.375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.04980630879911455, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.0005458372361317743, "learning_rate": 3.3066994468346654e-06, "loss": 0.0, "num_tokens": 2471769.0, "reward": 0.9714285135269165, "reward_std": 0.4063633680343628, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09642857313156128, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11216138303279877, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 315.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.049990776609481646, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.002422758192551555, "learning_rate": 3.3189920098340505e-06, "loss": 0.0001, "num_tokens": 2479677.0, "reward": 0.8854166865348816, "reward_std": 0.7405187487602234, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2604166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23936715722084045, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 597.0, "completions/mean_terminated_length": 597.0, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.05017524441984873, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.0010120907900272869, "learning_rate": 3.331284572833436e-06, "loss": 0.0, "num_tokens": 2491141.0, "reward": 1.462499976158142, "reward_std": 0.2875388264656067, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 399.5, "completions/mean_terminated_length": 399.5, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.050359712230215826, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.0008637919600005262, "learning_rate": 3.3435771358328212e-06, "loss": 0.0, "num_tokens": 2499209.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 731.875, "completions/mean_terminated_length": 543.857177734375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.05054418004058292, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.0006171542881929781, "learning_rate": 3.3558696988322072e-06, "loss": 0.0, "num_tokens": 2512256.0, "reward": 1.1812500953674316, "reward_std": 0.9395809769630432, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.21380901336669922, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.35624998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35999754071235657, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 702.5, "completions/mean_terminated_length": 702.5, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.05072864785095001, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.0006843165356258396, "learning_rate": 3.3681622618315924e-06, "loss": 0.0, "num_tokens": 2524500.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 295.375, "completions/mean_terminated_length": 295.375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.0509131156613171, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.0020496993674896657, "learning_rate": 3.3804548248309775e-06, "loss": 0.0001, "num_tokens": 2531119.0, "reward": 1.0057623386383057, "reward_std": 0.8580093383789062, "rewards/fixed_code_pass_all_test_reward/mean": 0.27659574151039124, "rewards/fixed_code_pass_all_test_reward/std": 0.313114196062088, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1041666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23464767634868622, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 843.625, "completions/mean_terminated_length": 671.5714721679688, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.051097583471684194, "frac_reward_zero_std": 0.0, "grad_norm": 0.67578125, "kl": 0.000512303231516853, "learning_rate": 3.392747387830363e-06, "loss": 0.0, "num_tokens": 2545100.0, "reward": 0.9749999642372131, "reward_std": 0.4200339913368225, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10000000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1511857956647873, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 530.75, "completions/mean_terminated_length": 530.75, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.05128205128205128, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.000764724361943081, "learning_rate": 3.4050399508297483e-06, "loss": 0.0, "num_tokens": 2555674.0, "reward": 0.925000011920929, "reward_std": 0.3845219910144806, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 198.75, "completions/mean_terminated_length": 198.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.051466519092418374, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0009325122664449736, "learning_rate": 3.4173325138291334e-06, "loss": 0.0, "num_tokens": 2560128.0, "reward": 1.1459821462631226, "reward_std": 0.8224037885665894, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.14598214626312256, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10982829332351685, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 378.75, "completions/mean_terminated_length": 378.75, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.05165098690278547, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0006200600073498208, "learning_rate": 3.429625076828519e-06, "loss": 0.0, "num_tokens": 2573038.0, "reward": 1.322115421295166, "reward_std": 0.1872267872095108, "rewards/fixed_code_pass_all_test_reward/mean": 0.32211539149284363, "rewards/fixed_code_pass_all_test_reward/std": 0.18722684681415558, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 235.0, "completions/mean_terminated_length": 235.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.051835454713152554, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0020606209109246265, "learning_rate": 3.441917639827904e-06, "loss": 0.0001, "num_tokens": 2577678.0, "reward": 1.433333396911621, "reward_std": 0.989628791809082, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4333333671092987, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.27313584089279175, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 128.75, "completions/mean_terminated_length": 128.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.05201992252351965, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.004435281211044639, "learning_rate": 3.45421020282729e-06, "loss": 0.0002, "num_tokens": 2581516.0, "reward": 2.080357074737549, "reward_std": 0.36620643734931946, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4553571343421936, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21765214204788208, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 315.125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.052204390333886734, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.0017310522198386025, "learning_rate": 3.4665027658266753e-06, "loss": 0.0001, "num_tokens": 2592293.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 446.25, "completions/mean_terminated_length": 446.25, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.05238885814425383, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.001690764634986408, "learning_rate": 3.4787953288260605e-06, "loss": 0.0001, "num_tokens": 2600559.0, "reward": 1.4659090042114258, "reward_std": 0.35852742195129395, "rewards/fixed_code_pass_all_test_reward/mean": 0.5909090638160706, "rewards/fixed_code_pass_all_test_reward/std": 0.12856486439704895, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 454.375, "completions/mean_terminated_length": 454.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.05257332595462092, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.000661617137666326, "learning_rate": 3.491087891825446e-06, "loss": 0.0, "num_tokens": 2613314.0, "reward": 1.1189024448394775, "reward_std": 0.6313847303390503, "rewards/fixed_code_pass_all_test_reward/mean": 0.24390244483947754, "rewards/fixed_code_pass_all_test_reward/std": 0.45161959528923035, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 258.125, "completions/mean_terminated_length": 258.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.05275779376498801, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.0009282620376325212, "learning_rate": 3.503380454824831e-06, "loss": 0.0, "num_tokens": 2618275.0, "reward": 1.1852272748947144, "reward_std": 0.6555896401405334, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06022727116942406, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08786292374134064, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 395.75, "completions/mean_terminated_length": 159.71429443359375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.0529422615753551, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "kl": 0.00921835952976835, "learning_rate": 3.5156730178242168e-06, "loss": 0.0004, "num_tokens": 2624961.0, "reward": 1.139880895614624, "reward_std": 0.8850737810134888, "rewards/fixed_code_pass_all_test_reward/mean": 0.7291666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.4537104666233063, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0357142873108387, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.06613001227378845, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 146.375, "completions/mean_terminated_length": 146.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.053126729385722195, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.004079639817064162, "learning_rate": 3.527965580823602e-06, "loss": 0.0002, "num_tokens": 2628892.0, "reward": 2.3499999046325684, "reward_std": 1.0014275312423706, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7250000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.337003618478775, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 172.0, "completions/mean_terminated_length": 172.0, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.05331119719608928, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.00238775651450851, "learning_rate": 3.540258143822987e-06, "loss": 0.0001, "num_tokens": 2633052.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1249.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 534.0, "completions/mean_terminated_length": 534.0, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.053495665006456375, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.0008708925306564197, "learning_rate": 3.552550706822373e-06, "loss": 0.0, "num_tokens": 2642244.0, "reward": 1.3869047164916992, "reward_std": 0.37962138652801514, "rewards/fixed_code_pass_all_test_reward/mean": 0.511904776096344, "rewards/fixed_code_pass_all_test_reward/std": 0.2229902595281601, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.05368013281682346, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.0006369253824232146, "learning_rate": 3.5648432698217582e-06, "loss": 0.0, "num_tokens": 2646674.0, "reward": 2.5916666984558105, "reward_std": 0.40853986144065857, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7166666984558105, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3505098521709442, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.053864600627190555, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0012665208141697804, "learning_rate": 3.577135832821144e-06, "loss": 0.0001, "num_tokens": 2650980.0, "reward": 1.475000023841858, "reward_std": 0.5548487305641174, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10000000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10690450668334961, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 620.0, "completions/mean_terminated_length": 416.0000305175781, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.05404906843755765, "frac_reward_zero_std": 0.0, "grad_norm": 0.74609375, "kl": 0.001158902741735801, "learning_rate": 3.589428395820529e-06, "loss": 0.0, "num_tokens": 2664212.0, "reward": 1.4812500476837158, "reward_std": 0.6771460771560669, "rewards/fixed_code_pass_all_test_reward/mean": 0.7312500476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.334810733795166, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 555.0, "completions/mean_terminated_length": 341.71429443359375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.054233536247924735, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.0010265377932228148, "learning_rate": 3.601720958819914e-06, "loss": 0.0, "num_tokens": 2674724.0, "reward": 1.28125, "reward_std": 0.7492555379867554, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 302.25, "completions/mean_terminated_length": 302.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.05441800405829183, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.0013963631572551094, "learning_rate": 3.6140135218192997e-06, "loss": 0.0001, "num_tokens": 2684390.0, "reward": 1.4265129566192627, "reward_std": 0.6662681102752686, "rewards/fixed_code_pass_all_test_reward/mean": 0.2023809552192688, "rewards/fixed_code_pass_all_test_reward/std": 0.36004117131233215, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3491319417953491, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.30469661951065063, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 243.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.05460247186865892, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.003591455504647456, "learning_rate": 3.626306084818685e-06, "loss": 0.0001, "num_tokens": 2690395.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 537.875, "completions/mean_terminated_length": 537.875, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.05478693967902601, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0010123924585059285, "learning_rate": 3.63859864781807e-06, "loss": 0.0, "num_tokens": 2699986.0, "reward": 1.7952245473861694, "reward_std": 0.571130633354187, "rewards/fixed_code_pass_all_test_reward/mean": 0.6087661981582642, "rewards/fixed_code_pass_all_test_reward/std": 0.26455986499786377, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3114583492279053, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19704854488372803, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 240.375, "completions/mean_terminated_length": 240.375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.0549714074893931, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.002006148915825179, "learning_rate": 3.650891210817456e-06, "loss": 0.0001, "num_tokens": 2705429.0, "reward": 0.8277777433395386, "reward_std": 0.5199036598205566, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07777778059244156, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10756237804889679, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.05515587529976019, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.0023325822257902473, "learning_rate": 3.663183773816841e-06, "loss": 0.0001, "num_tokens": 2712800.0, "reward": 1.8287036418914795, "reward_std": 0.2756976783275604, "rewards/fixed_code_pass_all_test_reward/mean": 0.8287037014961243, "rewards/fixed_code_pass_all_test_reward/std": 0.2756976783275604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 309.25, "completions/mean_terminated_length": 309.25, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.05534034311012728, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0016176275348698255, "learning_rate": 3.6754763368162267e-06, "loss": 0.0001, "num_tokens": 2718394.0, "reward": 0.8999999761581421, "reward_std": 0.7782764434814453, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2750000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3195979595184326, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 719.125, "completions/mean_terminated_length": 276.16668701171875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.055524810920494376, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.0009614037262508646, "learning_rate": 3.687768899815612e-06, "loss": 0.0, "num_tokens": 2731643.0, "reward": 0.9791666865348816, "reward_std": 0.6230844259262085, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2291666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2073548436164856, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 284.625, "completions/mean_terminated_length": 284.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.05570927873086146, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0012239632342243567, "learning_rate": 3.7000614628149974e-06, "loss": 0.0, "num_tokens": 2740072.0, "reward": 1.695399284362793, "reward_std": 0.3480432629585266, "rewards/fixed_code_pass_all_test_reward/mean": 0.0533854179084301, "rewards/fixed_code_pass_all_test_reward/std": 0.11211320012807846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6420139074325562, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3463256359100342, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 626.5, "completions/mean_terminated_length": 423.4285888671875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.055893746541228556, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.002472151507390663, "learning_rate": 3.7123540258143826e-06, "loss": 0.0001, "num_tokens": 2752028.0, "reward": 1.526374101638794, "reward_std": 0.7562454342842102, "rewards/fixed_code_pass_all_test_reward/mean": 0.5159574747085571, "rewards/fixed_code_pass_all_test_reward/std": 0.5191476941108704, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1354166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14971865713596344, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 331.625, "completions/mean_terminated_length": 331.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.05607821435159565, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.0027653599936456885, "learning_rate": 3.7246465888137677e-06, "loss": 0.0001, "num_tokens": 2757665.0, "reward": 1.6371031999588013, "reward_std": 0.8635258674621582, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.38710319995880127, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2187030166387558, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 534.875, "completions/mean_terminated_length": 534.875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.056262682161962736, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.0018053827407129575, "learning_rate": 3.7369391518131533e-06, "loss": 0.0001, "num_tokens": 2769776.0, "reward": 0.4730769395828247, "reward_std": 0.7279819846153259, "rewards/fixed_code_pass_all_test_reward/mean": 0.17307692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.3204762041568756, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 457.25, "completions/mean_terminated_length": 457.25, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.05644714997232983, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.001386056832416216, "learning_rate": 3.7492317148125385e-06, "loss": 0.0001, "num_tokens": 2780794.0, "reward": 1.138157844543457, "reward_std": 0.2794753313064575, "rewards/fixed_code_pass_all_test_reward/mean": 0.1381578892469406, "rewards/fixed_code_pass_all_test_reward/std": 0.27947530150413513, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 335.75, "completions/mean_terminated_length": 335.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.056631617782696916, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.0015366562292911112, "learning_rate": 3.7615242778119245e-06, "loss": 0.0001, "num_tokens": 2790952.0, "reward": 1.383333444595337, "reward_std": 0.657798171043396, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.15430335700511932, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25833332538604736, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39107099175453186, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 408.375, "completions/mean_terminated_length": 408.375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.05681608559306401, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.0011579195779631846, "learning_rate": 3.7738168408113096e-06, "loss": 0.0, "num_tokens": 2798507.0, "reward": 1.2583333253860474, "reward_std": 0.6565300226211548, "rewards/fixed_code_pass_all_test_reward/mean": 0.38333332538604736, "rewards/fixed_code_pass_all_test_reward/std": 0.4642796218395233, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 192.5, "completions/mean_terminated_length": 192.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.0570005534034311, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.00206302272999892, "learning_rate": 3.7861094038106948e-06, "loss": 0.0001, "num_tokens": 2802815.0, "reward": 0.925000011920929, "reward_std": 0.39910614490509033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213627576828, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 119.5, "completions/mean_terminated_length": 119.5, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.05718502121379819, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.0023294645434361883, "learning_rate": 3.7984019668100804e-06, "loss": 0.0001, "num_tokens": 2806627.0, "reward": 1.5499999523162842, "reward_std": 0.5732114911079407, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17499999701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19820624589920044, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 663.0, "completions/mean_terminated_length": 663.0, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.057369489024165284, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.0018006655409408268, "learning_rate": 3.8106945298094655e-06, "loss": 0.0001, "num_tokens": 2817339.0, "reward": 1.298076868057251, "reward_std": 0.6222997903823853, "rewards/fixed_code_pass_all_test_reward/mean": 0.42307692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.3760024905204773, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.05755395683453238, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.004290346085326746, "learning_rate": 3.822987092808851e-06, "loss": 0.0002, "num_tokens": 2825996.0, "reward": 1.2864583730697632, "reward_std": 0.5705736875534058, "rewards/fixed_code_pass_all_test_reward/mean": 0.15000000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.3505098521709442, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2614583373069763, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16838033497333527, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 444.875, "completions/mean_terminated_length": 444.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.057738424644899464, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.0005983375976938987, "learning_rate": 3.835279655808236e-06, "loss": 0.0, "num_tokens": 2833707.0, "reward": 1.9874999523162842, "reward_std": 0.3758324384689331, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11250000447034836, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11259917169809341, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 678.5, "completions/mean_terminated_length": 678.5, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.05792289245526656, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.0021470246865646914, "learning_rate": 3.847572218807621e-06, "loss": 0.0001, "num_tokens": 2845751.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.058107360265633644, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.0011982954783889, "learning_rate": 3.859864781807007e-06, "loss": 0.0, "num_tokens": 2850516.0, "reward": 2.4476189613342285, "reward_std": 0.28244155645370483, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4476190507411957, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2824415862560272, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1420.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 564.0, "completions/mean_terminated_length": 564.0, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.05829182807600074, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.0023300919638131745, "learning_rate": 3.8721573448063925e-06, "loss": 0.0001, "num_tokens": 2859732.0, "reward": 1.1941176652908325, "reward_std": 0.6294156908988953, "rewards/fixed_code_pass_all_test_reward/mean": 0.29411762952804565, "rewards/fixed_code_pass_all_test_reward/std": 0.3759976029396057, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 298.125, "completions/mean_terminated_length": 298.125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.05847629588636783, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0019673105562105775, "learning_rate": 3.884449907805778e-06, "loss": 0.0001, "num_tokens": 2866645.0, "reward": 0.9539474248886108, "reward_std": 0.565067708492279, "rewards/fixed_code_pass_all_test_reward/mean": 0.20394736528396606, "rewards/fixed_code_pass_all_test_reward/std": 0.22051043808460236, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 420.5, "completions/mean_terminated_length": 188.00001525878906, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.05866076369673492, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0015101574717846233, "learning_rate": 3.896742470805163e-06, "loss": 0.0001, "num_tokens": 2873033.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.05884523150710201, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.0024509090289939195, "learning_rate": 3.909035033804549e-06, "loss": 0.0001, "num_tokens": 2879563.0, "reward": 2.633333206176758, "reward_std": 0.2916836738586426, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7583333253860474, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17066629230976105, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 389.25, "completions/mean_terminated_length": 389.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.059029699317469105, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.002297013104907819, "learning_rate": 3.921327596803934e-06, "loss": 0.0001, "num_tokens": 2888789.0, "reward": 0.8787878751754761, "reward_std": 0.35524222254753113, "rewards/fixed_code_pass_all_test_reward/mean": 0.0037878789007663727, "rewards/fixed_code_pass_all_test_reward/std": 0.010713739320635796, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 540.75, "completions/mean_terminated_length": 540.75, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.05921416712783619, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.001161170828709146, "learning_rate": 3.933620159803319e-06, "loss": 0.0, "num_tokens": 2901835.0, "reward": 1.8041666746139526, "reward_std": 0.37945184111595154, "rewards/fixed_code_pass_all_test_reward/mean": 0.6944444179534912, "rewards/fixed_code_pass_all_test_reward/std": 0.31173408031463623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10972222685813904, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09547754377126694, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 425.0, "completions/mean_terminated_length": 425.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.059398634938203285, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0017571363496244885, "learning_rate": 3.945912722802704e-06, "loss": 0.0001, "num_tokens": 2911555.0, "reward": 1.0732684135437012, "reward_std": 0.08814657479524612, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07326839864253998, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08814654499292374, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.05958310274857037, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.0019742092699743807, "learning_rate": 3.95820528580209e-06, "loss": 0.0001, "num_tokens": 2916096.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 306.25, "completions/mean_terminated_length": 306.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.059767570558937465, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.0026281133759766817, "learning_rate": 3.9704978488014755e-06, "loss": 0.0001, "num_tokens": 2926170.0, "reward": 1.337499976158142, "reward_std": 0.6162965297698975, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21250000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.29489707946777344, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 603.625, "completions/mean_terminated_length": 603.625, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.05995203836930456, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.0015445830431417562, "learning_rate": 3.982790411800861e-06, "loss": 0.0001, "num_tokens": 2940119.0, "reward": 1.1145832538604736, "reward_std": 0.5016985535621643, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2395833283662796, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24134832620620728, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 197.5, "completions/mean_terminated_length": 197.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.060136506179671645, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.008719000270502875, "learning_rate": 3.995082974800247e-06, "loss": 0.0003, "num_tokens": 2944571.0, "reward": 1.1053571701049805, "reward_std": 0.5930077433586121, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10535714030265808, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09381375461816788, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 291.25, "completions/mean_terminated_length": 291.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.06032097399003874, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.0032587236346444115, "learning_rate": 4.007375537799632e-06, "loss": 0.0001, "num_tokens": 2951253.0, "reward": 1.3116161823272705, "reward_std": 0.6666498780250549, "rewards/fixed_code_pass_all_test_reward/mean": 0.39772725105285645, "rewards/fixed_code_pass_all_test_reward/std": 0.38855376839637756, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03888889029622078, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07582584768533707, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 351.375, "completions/mean_terminated_length": 351.375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.06050544180040583, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.002038333550444804, "learning_rate": 4.019668100799017e-06, "loss": 0.0001, "num_tokens": 2960208.0, "reward": 1.9250000715255737, "reward_std": 0.8680849075317383, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.30000001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23904572427272797, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 323.5, "completions/mean_terminated_length": 323.5, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.06068990961077292, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0034469623933546245, "learning_rate": 4.031960663798402e-06, "loss": 0.0001, "num_tokens": 2971396.0, "reward": 1.9594697952270508, "reward_std": 0.41454896330833435, "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.4291613698005676, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5958333015441895, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1577998548746109, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 261.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.06087437742114001, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.0028711779305012897, "learning_rate": 4.044253226797787e-06, "loss": 0.0001, "num_tokens": 2981339.0, "reward": 1.752164602279663, "reward_std": 0.5881946682929993, "rewards/fixed_code_pass_all_test_reward/mean": 0.6545454263687134, "rewards/fixed_code_pass_all_test_reward/std": 0.2887910008430481, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.22261905670166016, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24376583099365234, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 316.75, "completions/mean_terminated_length": 316.75, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.0610588452315071, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.001000301905150991, "learning_rate": 4.056545789797173e-06, "loss": 0.0, "num_tokens": 2987465.0, "reward": 2.377380847930908, "reward_std": 0.2636283338069916, "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4523809552192688, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15211617946624756, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 364.5, "completions/mean_terminated_length": 364.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.06124331304187419, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.00948553130729124, "learning_rate": 4.068838352796558e-06, "loss": 0.0004, "num_tokens": 2997661.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.061427780852241286, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.0043021846358897164, "learning_rate": 4.0811309157959435e-06, "loss": 0.0002, "num_tokens": 3004166.0, "reward": 0.9801136255264282, "reward_std": 0.37999671697616577, "rewards/fixed_code_pass_all_test_reward/mean": 0.011363636702299118, "rewards/fixed_code_pass_all_test_reward/std": 0.032141219824552536, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12938730418682098, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1643.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 1241.5, "completions/mean_terminated_length": 1241.5, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "epoch": 0.06161224866260837, "frac_reward_zero_std": 0.0, "grad_norm": 0.61328125, "kl": 0.0006506732315756381, "learning_rate": 4.0934234787953295e-06, "loss": 0.0, "num_tokens": 3031626.0, "reward": 1.5949866771697998, "reward_std": 0.6096673607826233, "rewards/fixed_code_pass_all_test_reward/mean": 0.6322463750839233, "rewards/fixed_code_pass_all_test_reward/std": 0.31111836433410645, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08774038404226303, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.06541642546653748, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 313.5, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.061796716472975466, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.0048949634219752625, "learning_rate": 4.105716041794715e-06, "loss": 0.0002, "num_tokens": 3038310.0, "reward": 0.7824999690055847, "reward_std": 0.48631995916366577, "rewards/fixed_code_pass_all_test_reward/mean": 0.032499998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.06041523069143295, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 527.75, "completions/mean_terminated_length": 527.75, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.06198118428334256, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.0027794560701295268, "learning_rate": 4.1180086047941e-06, "loss": 0.0001, "num_tokens": 3052444.0, "reward": 1.8071969747543335, "reward_std": 0.4818193018436432, "rewards/fixed_code_pass_all_test_reward/mean": 0.2613636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.43309786915779114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5458333492279053, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1816481202840805, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.062165652093709646, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.0050525745609775186, "learning_rate": 4.130301167793485e-06, "loss": 0.0002, "num_tokens": 3056366.0, "reward": 2.0250000953674316, "reward_std": 0.4200340211391449, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15000000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 301.5, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.06235011990407674, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0023422143567586318, "learning_rate": 4.14259373079287e-06, "loss": 0.0001, "num_tokens": 3065954.0, "reward": 1.375, "reward_std": 0.4464142918586731, "rewards/fixed_code_pass_all_test_reward/mean": 0.4750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320366859436, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 337.625, "completions/mean_terminated_length": 337.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.06253458771444383, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.0025294355291407555, "learning_rate": 4.154886293792256e-06, "loss": 0.0001, "num_tokens": 3075887.0, "reward": 1.762046217918396, "reward_std": 1.190953254699707, "rewards/fixed_code_pass_all_test_reward/mean": 0.5037128925323486, "rewards/fixed_code_pass_all_test_reward/std": 0.5306423306465149, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5083333253860474, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.32452651858329773, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 458.375, "completions/mean_terminated_length": 458.375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.06271905552481093, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.002262492518639192, "learning_rate": 4.167178856791641e-06, "loss": 0.0001, "num_tokens": 3084506.0, "reward": 1.203125, "reward_std": 0.35473519563674927, "rewards/fixed_code_pass_all_test_reward/mean": 0.328125, "rewards/fixed_code_pass_all_test_reward/std": 0.2603869140148163, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 483.625, "completions/mean_terminated_length": 483.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.06290352333517801, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.0018839370968635194, "learning_rate": 4.179471419791027e-06, "loss": 0.0001, "num_tokens": 3098079.0, "reward": 1.72029447555542, "reward_std": 0.3824726343154907, "rewards/fixed_code_pass_all_test_reward/mean": 0.32758620381355286, "rewards/fixed_code_pass_all_test_reward/std": 0.37548452615737915, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.39270833134651184, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2859624922275543, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 522.125, "completions/mean_terminated_length": 522.125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.0630879911455451, "frac_reward_zero_std": 0.0, "grad_norm": 0.86328125, "kl": 0.0011134747182950377, "learning_rate": 4.1917639827904124e-06, "loss": 0.0, "num_tokens": 3112736.0, "reward": 0.9541666507720947, "reward_std": 0.8706752061843872, "rewards/fixed_code_pass_all_test_reward/mean": 0.15416666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.34362101554870605, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17499999701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36154431104660034, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.06327245895591219, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.0011202387322555296, "learning_rate": 4.204056545789798e-06, "loss": 0.0, "num_tokens": 3118196.0, "reward": 2.416069746017456, "reward_std": 0.33996936678886414, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4160696864128113, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.33996933698654175, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 562.625, "completions/mean_terminated_length": 562.625, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.06345692676627929, "frac_reward_zero_std": 1.0, "grad_norm": 0.0223388671875, "kl": 0.002104606493958272, "learning_rate": 4.216349108789183e-06, "loss": 0.0001, "num_tokens": 3128617.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 519.125, "completions/mean_terminated_length": 300.71429443359375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.06364139457664637, "frac_reward_zero_std": 0.0, "grad_norm": 0.73828125, "kl": 0.002436648264847463, "learning_rate": 4.228641671788568e-06, "loss": 0.0001, "num_tokens": 3137194.0, "reward": 1.375, "reward_std": 0.6086390018463135, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.320287823677063, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 384.625, "completions/mean_terminated_length": 384.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.06382586238701346, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.00216316276055295, "learning_rate": 4.240934234787953e-06, "loss": 0.0001, "num_tokens": 3145999.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.06401033019738056, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.005547874636249617, "learning_rate": 4.253226797787339e-06, "loss": 0.0002, "num_tokens": 3153721.0, "reward": 1.9562500715255737, "reward_std": 0.6997129917144775, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.45624998211860657, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3922804296016693, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 433.5, "completions/mean_terminated_length": 433.5, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.06419479800774765, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.0009540260725771077, "learning_rate": 4.265519360786724e-06, "loss": 0.0, "num_tokens": 3163013.0, "reward": 1.46875, "reward_std": 0.5011148452758789, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.5011148452758789, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 825.375, "completions/mean_terminated_length": 825.375, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.06437926581811473, "frac_reward_zero_std": 0.0, "grad_norm": 0.75390625, "kl": 0.0023559652909170836, "learning_rate": 4.27781192378611e-06, "loss": 0.0001, "num_tokens": 3179560.0, "reward": 0.9812500476837158, "reward_std": 0.41052013635635376, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10624999552965164, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11475907266139984, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 392.125, "completions/mean_terminated_length": 392.125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.06456373362848183, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.00219918859511381, "learning_rate": 4.290104486785495e-06, "loss": 0.0001, "num_tokens": 3187633.0, "reward": 1.0833333730697632, "reward_std": 0.051434475928545, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.05143444985151291, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.06474820143884892, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.0018160033723688684, "learning_rate": 4.3023970497848805e-06, "loss": 0.0001, "num_tokens": 3191650.0, "reward": 1.3125, "reward_std": 0.49982136487960815, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11877350509166718, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 238.75, "completions/mean_terminated_length": 238.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.06493266924921601, "frac_reward_zero_std": 1.0, "grad_norm": 0.0274658203125, "kl": 0.0023928051959956065, "learning_rate": 4.314689612784266e-06, "loss": 0.0001, "num_tokens": 3199184.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.06511713705958311, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.003245831248932518, "learning_rate": 4.326982175783651e-06, "loss": 0.0001, "num_tokens": 3204578.0, "reward": 1.609090805053711, "reward_std": 0.3382013738155365, "rewards/fixed_code_pass_all_test_reward/mean": 0.5340908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.30900493264198303, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14880475401878357, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 240.75, "completions/mean_terminated_length": 240.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.0653016048699502, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.0021823915813001804, "learning_rate": 4.339274738783037e-06, "loss": 0.0001, "num_tokens": 3210032.0, "reward": 1.3216270208358765, "reward_std": 0.18994054198265076, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3216269910335541, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18994055688381195, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 175.375, "completions/mean_terminated_length": 175.375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.06548607268031728, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.013498928266926669, "learning_rate": 4.351567301782422e-06, "loss": 0.0005, "num_tokens": 3214491.0, "reward": 1.3216270208358765, "reward_std": 0.5403243899345398, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4466269910335541, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1983499974012375, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 348.375, "completions/mean_terminated_length": 348.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.06567054049068438, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.004377463046694174, "learning_rate": 4.363859864781808e-06, "loss": 0.0002, "num_tokens": 3225238.0, "reward": 1.412500023841858, "reward_std": 0.6916595101356506, "rewards/fixed_code_pass_all_test_reward/mean": 0.36250001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.4172614812850952, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17500001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 375.375, "completions/mean_terminated_length": 375.375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.06585500830105147, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.002586514950962737, "learning_rate": 4.376152427781193e-06, "loss": 0.0001, "num_tokens": 3234825.0, "reward": 1.8545454740524292, "reward_std": 0.464717298746109, "rewards/fixed_code_pass_all_test_reward/mean": 0.4545454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.45454543828964233, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.40000003576278687, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.32071349024772644, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 425.25, "completions/mean_terminated_length": 425.25, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.06603947611141855, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.0048077270039357245, "learning_rate": 4.388444990780578e-06, "loss": 0.0002, "num_tokens": 3243403.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.06622394392178566, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.003359046357218176, "learning_rate": 4.400737553779963e-06, "loss": 0.0001, "num_tokens": 3253763.0, "reward": 2.65238094329834, "reward_std": 0.15867504477500916, "rewards/fixed_code_pass_all_test_reward/mean": 0.860714316368103, "rewards/fixed_code_pass_all_test_reward/std": 0.08962181955575943, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7916666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14847104251384735, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 301.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.06640841173215274, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.0020520018006209284, "learning_rate": 4.4130301167793486e-06, "loss": 0.0001, "num_tokens": 3262623.0, "reward": 1.5074405670166016, "reward_std": 0.6422489285469055, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3824405074119568, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3332238495349884, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 866.375, "completions/mean_terminated_length": 697.5714721679688, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.06659287954251983, "frac_reward_zero_std": 0.0, "grad_norm": 0.59765625, "kl": 0.001482957879488822, "learning_rate": 4.425322679778734e-06, "loss": 0.0001, "num_tokens": 3281994.0, "reward": 0.9249999523162842, "reward_std": 0.3845219612121582, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.06677734735288691, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.0012272042586118914, "learning_rate": 4.43761524277812e-06, "loss": 0.0, "num_tokens": 3286928.0, "reward": 0.949999988079071, "reward_std": 0.41057446599006653, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14880475401878357, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.06696181516325402, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.003250627822126262, "learning_rate": 4.449907805777505e-06, "loss": 0.0001, "num_tokens": 3295429.0, "reward": 1.9799602031707764, "reward_std": 0.11497937142848969, "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.06734350323677063, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05138888955116272, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07673622667789459, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 497.75, "completions/mean_terminated_length": 497.75, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.0671462829736211, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.001615637171198614, "learning_rate": 4.462200368776891e-06, "loss": 0.0001, "num_tokens": 3304379.0, "reward": 1.901893973350525, "reward_std": 0.561603307723999, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15189394354820251, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11964955180883408, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 301.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.06733075078398819, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.003006309416377917, "learning_rate": 4.474492931776276e-06, "loss": 0.0001, "num_tokens": 3313551.0, "reward": 1.6535561084747314, "reward_std": 0.4538537263870239, "rewards/fixed_code_pass_all_test_reward/mean": 0.7629310488700867, "rewards/fixed_code_pass_all_test_reward/std": 0.28221595287323, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.015625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.04419417306780815, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 353.5, "completions/mean_terminated_length": 353.5, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.06751521859435529, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.00882939895382151, "learning_rate": 4.486785494775661e-06, "loss": 0.0004, "num_tokens": 3320427.0, "reward": 1.0458333492279053, "reward_std": 0.46254557371139526, "rewards/fixed_code_pass_all_test_reward/mean": 0.03333333507180214, "rewards/fixed_code_pass_all_test_reward/std": 0.0471404530107975, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13750000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20658792555332184, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 423.0, "completions/mean_terminated_length": 423.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.06769968640472238, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.003555713774403557, "learning_rate": 4.499078057775046e-06, "loss": 0.0001, "num_tokens": 3330411.0, "reward": 1.9060606956481934, "reward_std": 0.8507391214370728, "rewards/fixed_code_pass_all_test_reward/mean": 0.7727272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.42362356185913086, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25833335518836975, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24800795316696167, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 406.125, "completions/mean_terminated_length": 406.125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.06788415421508946, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.0035248897620476782, "learning_rate": 4.5113706207744315e-06, "loss": 0.0001, "num_tokens": 3338532.0, "reward": 1.3515625, "reward_std": 0.2542880177497864, "rewards/fixed_code_pass_all_test_reward/mean": 0.3515625, "rewards/fixed_code_pass_all_test_reward/std": 0.2542880177497864, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 457.125, "completions/mean_terminated_length": 457.125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.06806862202545656, "frac_reward_zero_std": 1.0, "grad_norm": 0.01806640625, "kl": 0.0016632177139399573, "learning_rate": 4.5236631837738175e-06, "loss": 0.0001, "num_tokens": 3351981.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 205.0, "completions/mean_terminated_length": 205.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.06825308983582365, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.0027365024798200466, "learning_rate": 4.535955746773203e-06, "loss": 0.0001, "num_tokens": 3356501.0, "reward": 1.524999976158142, "reward_std": 0.8564878106117249, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4000000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3505098521709442, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 103.25, "completions/mean_terminated_length": 103.25, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.06843755764619074, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.005875723465578631, "learning_rate": 4.548248309772588e-06, "loss": 0.0002, "num_tokens": 3360207.0, "reward": 2.460416793823242, "reward_std": 0.2207377851009369, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.46041667461395264, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22073785960674286, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 368.0, "completions/mean_terminated_length": 368.0, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.06862202545655784, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.0013687673272215761, "learning_rate": 4.560540872771974e-06, "loss": 0.0001, "num_tokens": 3366919.0, "reward": 2.6416666507720947, "reward_std": 0.3365322947502136, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7666666507720947, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08544931560754776, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 293.125, "completions/mean_terminated_length": 293.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.06880649326692492, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.001692331163212657, "learning_rate": 4.572833435771359e-06, "loss": 0.0001, "num_tokens": 3372256.0, "reward": 0.9035714864730835, "reward_std": 0.758691132068634, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2785714268684387, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2626396715641022, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.06899096107729201, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.003037985705304891, "learning_rate": 4.585125998770744e-06, "loss": 0.0001, "num_tokens": 3381388.0, "reward": 0.981249988079071, "reward_std": 0.45034709572792053, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10625000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21784251928329468, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 240.25, "completions/mean_terminated_length": 240.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.06917542888765911, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.00409601884894073, "learning_rate": 4.597418561770129e-06, "loss": 0.0002, "num_tokens": 3386342.0, "reward": 2.538094997406006, "reward_std": 0.15993192791938782, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.538095235824585, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1599319726228714, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 281.25, "completions/mean_terminated_length": 281.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.0693598966980262, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.005197781836614013, "learning_rate": 4.609711124769514e-06, "loss": 0.0002, "num_tokens": 3394800.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 512.625, "completions/mean_terminated_length": 512.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.06954436450839328, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.009709350924822502, "learning_rate": 4.6220036877689e-06, "loss": 0.0004, "num_tokens": 3407373.0, "reward": 1.2937500476837158, "reward_std": 0.9726390838623047, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.16874998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2016671597957611, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 481.25, "completions/mean_terminated_length": 481.25, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.06972883231876037, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.0028098439797759056, "learning_rate": 4.6342962507682856e-06, "loss": 0.0001, "num_tokens": 3416087.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.06991330012912747, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0036708306579384953, "learning_rate": 4.646588813767671e-06, "loss": 0.0001, "num_tokens": 3422771.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.07009776793949456, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "kl": 0.002353470103116706, "learning_rate": 4.658881376767057e-06, "loss": 0.0001, "num_tokens": 3426691.0, "reward": 1.6678571701049805, "reward_std": 0.5577834844589233, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04285714402794838, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08081220835447311, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 459.5, "completions/mean_terminated_length": 459.5, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.07028223574986164, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.0021507589844986796, "learning_rate": 4.671173939766442e-06, "loss": 0.0001, "num_tokens": 3435263.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 395.875, "completions/mean_terminated_length": 395.875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.07046670356022874, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.0028687994345091283, "learning_rate": 4.683466502765827e-06, "loss": 0.0001, "num_tokens": 3448286.0, "reward": 1.375, "reward_std": 0.2712405323982239, "rewards/fixed_code_pass_all_test_reward/mean": 0.10000000149011612, "rewards/fixed_code_pass_all_test_reward/std": 0.10690450668334961, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2750000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2121320515871048, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 386.25, "completions/mean_terminated_length": 386.25, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.07065117137059583, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.0034375410759821534, "learning_rate": 4.695759065765212e-06, "loss": 0.0001, "num_tokens": 3457984.0, "reward": 1.399999976158142, "reward_std": 0.8564878702163696, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15000000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18322508037090302, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 363.5, "completions/mean_terminated_length": 363.5, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.07083563918096292, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.0067488054919522256, "learning_rate": 4.708051628764598e-06, "loss": 0.0003, "num_tokens": 3468068.0, "reward": 1.8322703838348389, "reward_std": 0.4330956041812897, "rewards/fixed_code_pass_all_test_reward/mean": 0.5510203838348389, "rewards/fixed_code_pass_all_test_reward/std": 0.3898211121559143, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.28125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2590332329273224, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.07102010699133002, "frac_reward_zero_std": 0.0, "grad_norm": 3.4375, "kl": 0.004908254253678024, "learning_rate": 4.720344191763983e-06, "loss": 0.0002, "num_tokens": 3471955.0, "reward": 2.0833334922790527, "reward_std": 0.3338092267513275, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2083333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14880476891994476, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 498.625, "completions/mean_terminated_length": 498.625, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.0712045748016971, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.003505851564113982, "learning_rate": 4.7326367547633685e-06, "loss": 0.0001, "num_tokens": 3486608.0, "reward": 1.3035714626312256, "reward_std": 0.7177771329879761, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3035714328289032, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24095898866653442, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 397.75, "completions/mean_terminated_length": 397.75, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.07138904261206419, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.003063147894863505, "learning_rate": 4.744929317762754e-06, "loss": 0.0001, "num_tokens": 3495134.0, "reward": 1.1833332777023315, "reward_std": 0.17728105187416077, "rewards/fixed_code_pass_all_test_reward/mean": 0.10833334177732468, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14880475401878357, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 260.0, "completions/mean_terminated_length": 260.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.07157351042243129, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.007044025289360434, "learning_rate": 4.75722188076214e-06, "loss": 0.0003, "num_tokens": 3504782.0, "reward": 1.475000023841858, "reward_std": 0.39430472254753113, "rewards/fixed_code_pass_all_test_reward/mean": 0.23333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.35276684165000916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24166667461395264, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2980092763900757, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 579.25, "completions/mean_terminated_length": 579.25, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.07175797823279838, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.0018041631483356468, "learning_rate": 4.769514443761525e-06, "loss": 0.0001, "num_tokens": 3519864.0, "reward": 1.6166667938232422, "reward_std": 0.4193721115589142, "rewards/fixed_code_pass_all_test_reward/mean": 0.14999999105930328, "rewards/fixed_code_pass_all_test_reward/std": 0.34641018509864807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.46666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35634830594062805, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 256.375, "completions/mean_terminated_length": 256.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.07194244604316546, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.005487099202582613, "learning_rate": 4.78180700676091e-06, "loss": 0.0002, "num_tokens": 3528291.0, "reward": 1.28125, "reward_std": 0.27508115768432617, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.28125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.27508115768432617, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 427.125, "completions/mean_terminated_length": 427.125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.07212691385353256, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.006066471891244873, "learning_rate": 4.794099569760295e-06, "loss": 0.0002, "num_tokens": 3538884.0, "reward": 1.9128401279449463, "reward_std": 0.3698350787162781, "rewards/fixed_code_pass_all_test_reward/mean": 0.6122449040412903, "rewards/fixed_code_pass_all_test_reward/std": 0.37009018659591675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.300595223903656, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2631625831127167, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 505.75, "completions/mean_terminated_length": 505.75, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.07231138166389965, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.0038074051262810826, "learning_rate": 4.806392132759681e-06, "loss": 0.0002, "num_tokens": 3548178.0, "reward": 0.9698717594146729, "reward_std": 0.667245090007782, "rewards/fixed_code_pass_all_test_reward/mean": 0.33653849363327026, "rewards/fixed_code_pass_all_test_reward/std": 0.08158925175666809, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13333332538604736, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20548047125339508, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 373.125, "completions/mean_terminated_length": 373.125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.07249584947426674, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.004919704049825668, "learning_rate": 4.818684695759066e-06, "loss": 0.0002, "num_tokens": 3558939.0, "reward": 1.3795139789581299, "reward_std": 0.6284568309783936, "rewards/fixed_code_pass_all_test_reward/mean": 0.2222222238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.31426966190338135, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2822916507720947, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1877412497997284, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 224.75, "completions/mean_terminated_length": 224.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.07268031728463382, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.006421609519748017, "learning_rate": 4.830977258758451e-06, "loss": 0.0003, "num_tokens": 3568337.0, "reward": 1.696279764175415, "reward_std": 0.4803399443626404, "rewards/fixed_code_pass_all_test_reward/mean": 0.386904776096344, "rewards/fixed_code_pass_all_test_reward/std": 0.5078611969947815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.30937498807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1737447828054428, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.07286478509500093, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.008019195171073079, "learning_rate": 4.8432698217578365e-06, "loss": 0.0003, "num_tokens": 3575593.0, "reward": 1.2916667461395264, "reward_std": 0.25370272994041443, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25370272994041443, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.07304925290536801, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.004641923296730965, "learning_rate": 4.8555623847572225e-06, "loss": 0.0002, "num_tokens": 3583161.0, "reward": 1.658333420753479, "reward_std": 0.29907265305519104, "rewards/fixed_code_pass_all_test_reward/mean": 0.5583333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.2958710789680481, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10000000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10690450668334961, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 278.125, "completions/mean_terminated_length": 278.125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.0732337207157351, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.004452351247891784, "learning_rate": 4.867854947756608e-06, "loss": 0.0002, "num_tokens": 3592306.0, "reward": 1.1979167461395264, "reward_std": 0.3505593538284302, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 213.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.0734181885261022, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.012064472830388695, "learning_rate": 4.880147510755993e-06, "loss": 0.0005, "num_tokens": 3597748.0, "reward": 1.4421026706695557, "reward_std": 0.5962096452713013, "rewards/fixed_code_pass_all_test_reward/mean": 0.11918604373931885, "rewards/fixed_code_pass_all_test_reward/std": 0.024666516110301018, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4479166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.27479249238967896, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 466.0, "completions/mean_terminated_length": 466.0, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.07360265633646929, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.004319610423408449, "learning_rate": 4.892440073755379e-06, "loss": 0.0002, "num_tokens": 3608652.0, "reward": 1.6593749523162842, "reward_std": 0.3294709324836731, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.35708382725715637, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15937499701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11576502025127411, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 285.875, "completions/mean_terminated_length": 285.875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.07378712414683637, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.0052676523046102375, "learning_rate": 4.904732636754764e-06, "loss": 0.0002, "num_tokens": 3615083.0, "reward": 1.1919642686843872, "reward_std": 0.1759372055530548, "rewards/fixed_code_pass_all_test_reward/mean": 0.1607142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.11921756714582443, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 254.625, "completions/mean_terminated_length": 254.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.07397159195720347, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.005419381806859747, "learning_rate": 4.917025199754149e-06, "loss": 0.0002, "num_tokens": 3621520.0, "reward": 1.3772727251052856, "reward_std": 0.18253104388713837, "rewards/fixed_code_pass_all_test_reward/mean": 0.22727271914482117, "rewards/fixed_code_pass_all_test_reward/std": 0.1818181872367859, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15000000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17728105187416077, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.07415605976757056, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.004259753070073202, "learning_rate": 4.929317762753534e-06, "loss": 0.0002, "num_tokens": 3628696.0, "reward": 1.2333333492279053, "reward_std": 0.1636350005865097, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333283662796, "rewards/fixed_code_pass_all_test_reward/std": 0.18352431058883667, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.07434052757793765, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.0029250553343445063, "learning_rate": 4.9416103257529195e-06, "loss": 0.0001, "num_tokens": 3633437.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 490.25, "completions/mean_terminated_length": 490.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.07452499538830475, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.002476187153661158, "learning_rate": 4.9539028887523055e-06, "loss": 0.0001, "num_tokens": 3641631.0, "reward": 2.3520562648773193, "reward_std": 0.29544368386268616, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.30860668420791626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5187229514122009, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15452302992343903, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 234.0, "completions/mean_terminated_length": 234.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.07470946319867183, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.003387229284271598, "learning_rate": 4.966195451751691e-06, "loss": 0.0001, "num_tokens": 3647375.0, "reward": 1.8701388835906982, "reward_std": 0.40359315276145935, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.12013889849185944, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.05968299135565758, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.07489393100903892, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "kl": 0.0037029007507953793, "learning_rate": 4.978488014751076e-06, "loss": 0.0001, "num_tokens": 3651101.0, "reward": 1.125, "reward_std": 0.1832250952720642, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1832250952720642, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 511.625, "completions/mean_terminated_length": 511.625, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.07507839881940602, "frac_reward_zero_std": 1.0, "grad_norm": 0.0255126953125, "kl": 0.0019787335622822866, "learning_rate": 4.990780577750462e-06, "loss": 0.0001, "num_tokens": 3659794.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 374.25, "completions/mean_terminated_length": 374.25, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.0752628666297731, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.004231046092172619, "learning_rate": 5.003073140749847e-06, "loss": 0.0002, "num_tokens": 3669828.0, "reward": 1.3035794496536255, "reward_std": 0.2578947842121124, "rewards/fixed_code_pass_all_test_reward/mean": 0.204861119389534, "rewards/fixed_code_pass_all_test_reward/std": 0.2859656512737274, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0987183153629303, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09073367714881897, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.07544733444014019, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.003107618962530978, "learning_rate": 5.015365703749232e-06, "loss": 0.0001, "num_tokens": 3676236.0, "reward": 1.1421053409576416, "reward_std": 0.6393507122993469, "rewards/fixed_code_pass_all_test_reward/mean": 0.21710526943206787, "rewards/fixed_code_pass_all_test_reward/std": 0.4081069231033325, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.07563180225050728, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.006817752990173176, "learning_rate": 5.027658266748617e-06, "loss": 0.0003, "num_tokens": 3683159.0, "reward": 2.3645834922790527, "reward_std": 0.4541202485561371, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4895833730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23586006462574005, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 447.375, "completions/mean_terminated_length": 447.375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.07581627006087438, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.0041386488592252135, "learning_rate": 5.039950829748002e-06, "loss": 0.0002, "num_tokens": 3694586.0, "reward": 1.4535713195800781, "reward_std": 0.6086714863777161, "rewards/fixed_code_pass_all_test_reward/mean": 0.0535714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.10628911852836609, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5249999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.26049405336380005, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 356.375, "completions/mean_terminated_length": 356.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.07600073787124147, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.008600425469921902, "learning_rate": 5.052243392747388e-06, "loss": 0.0003, "num_tokens": 3705789.0, "reward": 1.1145833730697632, "reward_std": 0.1602175235748291, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1145833432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1602174937725067, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 392.125, "completions/mean_terminated_length": 392.125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.07618520568160855, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.0020750991970999166, "learning_rate": 5.0645359557467735e-06, "loss": 0.0001, "num_tokens": 3712302.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 364.625, "completions/mean_terminated_length": 364.625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.07636967349197565, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.004055292301927693, "learning_rate": 5.076828518746159e-06, "loss": 0.0002, "num_tokens": 3722011.0, "reward": 1.4906249046325684, "reward_std": 0.4104810357093811, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.4314318001270294, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05312499776482582, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07841909676790237, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 341.875, "completions/mean_terminated_length": 341.875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.07655414130234274, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.00480498478282243, "learning_rate": 5.089121081745544e-06, "loss": 0.0002, "num_tokens": 3729186.0, "reward": 1.377173900604248, "reward_std": 0.40664049983024597, "rewards/fixed_code_pass_all_test_reward/mean": 0.2771739065647125, "rewards/fixed_code_pass_all_test_reward/std": 0.44820132851600647, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10000000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1511857956647873, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.07673860911270983, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.013404958830506075, "learning_rate": 5.101413644744931e-06, "loss": 0.0005, "num_tokens": 3733670.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.07692307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.004679436024161987, "learning_rate": 5.113706207744316e-06, "loss": 0.0002, "num_tokens": 3742847.0, "reward": 2.6299242973327637, "reward_std": 0.17882896959781647, "rewards/fixed_code_pass_all_test_reward/mean": 0.9924242496490479, "rewards/fixed_code_pass_all_test_reward/std": 0.010455549694597721, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.637499988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18468119204044342, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.07710754473344401, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.09370940961525775, "learning_rate": 5.125998770743701e-06, "loss": 0.0037, "num_tokens": 3750980.0, "reward": 1.866666555404663, "reward_std": 0.6151525378227234, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24166667461395264, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.27357134222984314, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 674.375, "completions/mean_terminated_length": 674.375, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.0772920125438111, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.0013128605714882724, "learning_rate": 5.138291333743086e-06, "loss": 0.0001, "num_tokens": 3767839.0, "reward": 1.875, "reward_std": 0.30355045199394226, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.29880714416503906, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17728105187416077, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.0774764803541782, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.008457150950562209, "learning_rate": 5.150583896742471e-06, "loss": 0.0003, "num_tokens": 3777453.0, "reward": 1.532358169555664, "reward_std": 0.5330532193183899, "rewards/fixed_code_pass_all_test_reward/mean": 0.42819148302078247, "rewards/fixed_code_pass_all_test_reward/std": 0.490145742893219, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2291666567325592, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.29784277081489563, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 484.25, "completions/mean_terminated_length": 484.25, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.07766094816454529, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.007723247690591961, "learning_rate": 5.1628764597418564e-06, "loss": 0.0003, "num_tokens": 3786279.0, "reward": 0.6499999761581421, "reward_std": 0.5424810647964478, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.07784541597491237, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.0028591918817255646, "learning_rate": 5.1751690227412424e-06, "loss": 0.0001, "num_tokens": 3790914.0, "reward": 1.2321429252624512, "reward_std": 0.7103463411331177, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1071428656578064, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10101525485515594, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 404.375, "completions/mean_terminated_length": 404.375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.07802988378527947, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.008565702941268682, "learning_rate": 5.187461585740628e-06, "loss": 0.0003, "num_tokens": 3804333.0, "reward": 1.15625, "reward_std": 0.6399986147880554, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 424.75, "completions/mean_terminated_length": 424.75, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.07821435159564656, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.0022402764661819674, "learning_rate": 5.199754148740013e-06, "loss": 0.0001, "num_tokens": 3814491.0, "reward": 1.28125, "reward_std": 0.2999255657196045, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.28125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2999255955219269, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 182.75, "completions/mean_terminated_length": 182.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.07839881940601365, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.0032083377154776827, "learning_rate": 5.212046711739398e-06, "loss": 0.0001, "num_tokens": 3818841.0, "reward": 0.800000011920929, "reward_std": 0.5126959681510925, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213627576828, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.07858328721638075, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.0029157616227166727, "learning_rate": 5.224339274738783e-06, "loss": 0.0001, "num_tokens": 3823466.0, "reward": 1.225000023841858, "reward_std": 0.40620189905166626, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10000000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10690450668334961, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 429.25, "completions/mean_terminated_length": 429.25, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.07876775502674783, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.006843961775302887, "learning_rate": 5.236631837738169e-06, "loss": 0.0003, "num_tokens": 3833828.0, "reward": 1.555842399597168, "reward_std": 0.44486865401268005, "rewards/fixed_code_pass_all_test_reward/mean": 0.5652173757553101, "rewards/fixed_code_pass_all_test_reward/std": 0.2229112982749939, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11562500149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14573551714420319, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 306.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.07895222283711492, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.0042276762396795675, "learning_rate": 5.248924400737554e-06, "loss": 0.0002, "num_tokens": 3840215.0, "reward": 0.7749999761581421, "reward_std": 0.48329228162765503, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 112.75, "completions/mean_terminated_length": 112.75, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.07913669064748201, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.012120215251343325, "learning_rate": 5.261216963736939e-06, "loss": 0.0005, "num_tokens": 3843901.0, "reward": 2.2125000953674316, "reward_std": 0.36425071954727173, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21250000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36425071954727173, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 444.875, "completions/mean_terminated_length": 215.85714721679688, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.07932115845784911, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.0021232616891211364, "learning_rate": 5.2735095267363245e-06, "loss": 0.0001, "num_tokens": 3850396.0, "reward": 1.1678571701049805, "reward_std": 0.7091960906982422, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04285714402794838, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08081220835447311, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.0795056262682162, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.0067113537807017565, "learning_rate": 5.28580208973571e-06, "loss": 0.0003, "num_tokens": 3854548.0, "reward": 0.9750000238418579, "reward_std": 0.8647873401641846, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10000000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2828427255153656, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 302.125, "completions/mean_terminated_length": 302.125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.07969009407858328, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.00698285351973027, "learning_rate": 5.298094652735095e-06, "loss": 0.0003, "num_tokens": 3862597.0, "reward": 1.6791667938232422, "reward_std": 0.5119696259498596, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.23224268853664398, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.42916667461395264, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20270982384681702, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 623.75, "completions/mean_terminated_length": 623.75, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.07987456188895038, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.004637513498892076, "learning_rate": 5.310387215734482e-06, "loss": 0.0002, "num_tokens": 3874723.0, "reward": 1.0, "reward_std": 0.5092790126800537, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.1385153830051422, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 295.875, "completions/mean_terminated_length": 295.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.08005902969931747, "frac_reward_zero_std": 1.0, "grad_norm": 0.2158203125, "kl": 0.009178727166727185, "learning_rate": 5.322679778733867e-06, "loss": 0.0004, "num_tokens": 3884746.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 357.75, "completions/mean_terminated_length": 357.75, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.08024349750968456, "frac_reward_zero_std": 1.0, "grad_norm": 0.045166015625, "kl": 0.00500155083136633, "learning_rate": 5.334972341733252e-06, "loss": 0.0002, "num_tokens": 3892424.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 274.5, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.08042796532005166, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.007273981405887753, "learning_rate": 5.347264904732637e-06, "loss": 0.0003, "num_tokens": 3900556.0, "reward": 1.1729166507720947, "reward_std": 0.15784701704978943, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17291666567325592, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15784700214862823, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.08061243313041874, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.008048689982388169, "learning_rate": 5.359557467732023e-06, "loss": 0.0003, "num_tokens": 3910807.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 208.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.08079690094078583, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.004172776389168575, "learning_rate": 5.371850030731408e-06, "loss": 0.0002, "num_tokens": 3915532.0, "reward": 1.024999976158142, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 461.375, "completions/mean_terminated_length": 461.375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.08098136875115293, "frac_reward_zero_std": 0.0, "grad_norm": 0.90234375, "kl": 0.0034479626192478463, "learning_rate": 5.384142593730793e-06, "loss": 0.0001, "num_tokens": 3924903.0, "reward": 1.6041666269302368, "reward_std": 0.4060797691345215, "rewards/fixed_code_pass_all_test_reward/mean": 0.6041666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.4060797691345215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.08116583656152002, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.007364308403339237, "learning_rate": 5.3964351567301786e-06, "loss": 0.0003, "num_tokens": 3930833.0, "reward": 1.249431848526001, "reward_std": 0.4331008791923523, "rewards/fixed_code_pass_all_test_reward/mean": 0.3181818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.42916133999824524, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05624999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10500850528478622, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 177.625, "completions/mean_terminated_length": 177.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.0813503043718871, "frac_reward_zero_std": 1.0, "grad_norm": 0.051513671875, "kl": 0.007002560596447438, "learning_rate": 5.408727719729564e-06, "loss": 0.0003, "num_tokens": 3938446.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 419.0, "completions/mean_terminated_length": 419.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.0815347721822542, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.006205364712513983, "learning_rate": 5.42102028272895e-06, "loss": 0.0002, "num_tokens": 3949334.0, "reward": 2.0176587104797363, "reward_std": 0.06495030224323273, "rewards/fixed_code_pass_all_test_reward/mean": 0.831250011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.07039429992437363, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.18640872836112976, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09697160869836807, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.08171923999262129, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.00625326216686517, "learning_rate": 5.433312845728335e-06, "loss": 0.0003, "num_tokens": 3953536.0, "reward": 0.6428571343421936, "reward_std": 0.7824608087539673, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.01785714365541935, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.05050762742757797, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 356.0, "completions/mean_terminated_length": 356.0, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.08190370780298838, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.01361153824836947, "learning_rate": 5.44560540872772e-06, "loss": 0.0005, "num_tokens": 3961080.0, "reward": 1.4946705102920532, "reward_std": 0.5407335758209229, "rewards/fixed_code_pass_all_test_reward/mean": 0.8488372564315796, "rewards/fixed_code_pass_all_test_reward/std": 0.14655643701553345, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02083333395421505, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 366.625, "completions/mean_terminated_length": 366.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.08208817561335546, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.006909786548931152, "learning_rate": 5.457897971727105e-06, "loss": 0.0003, "num_tokens": 3971117.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 669.125, "completions/mean_terminated_length": 669.125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.08227264342372256, "frac_reward_zero_std": 0.0, "grad_norm": 0.73046875, "kl": 0.002819207467837259, "learning_rate": 5.47019053472649e-06, "loss": 0.0001, "num_tokens": 3989846.0, "reward": 1.1708333492279053, "reward_std": 0.5541151762008667, "rewards/fixed_code_pass_all_test_reward/mean": 0.1145833358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.18125000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19988836348056793, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.08245711123408965, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.010604145849356428, "learning_rate": 5.4824830977258755e-06, "loss": 0.0004, "num_tokens": 3998728.0, "reward": 0.9458333253860474, "reward_std": 0.39357438683509827, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07083333283662796, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09829902648925781, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.08264157904445674, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.00839587877271697, "learning_rate": 5.4947756607252615e-06, "loss": 0.0003, "num_tokens": 4009373.0, "reward": 2.2955358028411865, "reward_std": 0.39399996399879456, "rewards/fixed_code_pass_all_test_reward/mean": 0.7562500238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.33640697598457336, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5392857193946838, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19172951579093933, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 385.125, "completions/mean_terminated_length": 385.125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.08282604685482384, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.004183960176305845, "learning_rate": 5.5070682237246475e-06, "loss": 0.0002, "num_tokens": 4018278.0, "reward": 1.6914077997207642, "reward_std": 0.4801311790943146, "rewards/fixed_code_pass_all_test_reward/mean": 0.44021737575531006, "rewards/fixed_code_pass_all_test_reward/std": 0.46563661098480225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2511904835700989, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14804087579250336, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.08301051466519092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0303955078125, "kl": 0.0035710194788407534, "learning_rate": 5.519360786724033e-06, "loss": 0.0001, "num_tokens": 4023809.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 313.625, "completions/mean_terminated_length": 313.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.08319498247555801, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.010879341294639744, "learning_rate": 5.531653349723418e-06, "loss": 0.0004, "num_tokens": 4034878.0, "reward": 1.3972222805023193, "reward_std": 0.2698570489883423, "rewards/fixed_code_pass_all_test_reward/mean": 0.2708333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.2947940528392792, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.12638889253139496, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12156207859516144, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 365.125, "completions/mean_terminated_length": 365.125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.08337945028592511, "frac_reward_zero_std": 1.0, "grad_norm": 0.035888671875, "kl": 0.004857400374021381, "learning_rate": 5.543945912722804e-06, "loss": 0.0002, "num_tokens": 4042839.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.0835639180962922, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.004448103223694488, "learning_rate": 5.556238475722189e-06, "loss": 0.0002, "num_tokens": 4048925.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.08374838590665928, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.0052304187847767025, "learning_rate": 5.568531038721574e-06, "loss": 0.0002, "num_tokens": 4053413.0, "reward": 1.4583332538604736, "reward_std": 0.4813176095485687, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17457431554794312, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 215.0, "completions/mean_terminated_length": 215.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.08393285371702638, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.00793857709504664, "learning_rate": 5.580823601720959e-06, "loss": 0.0003, "num_tokens": 4060301.0, "reward": 2.5, "reward_std": 0.6928202509880066, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4200340211391449, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.08411732152739347, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.0043219695799052715, "learning_rate": 5.593116164720344e-06, "loss": 0.0002, "num_tokens": 4064707.0, "reward": 2.2750000953674316, "reward_std": 0.5119988918304443, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5249999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1832250952720642, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.08430178933776056, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.004273270693374798, "learning_rate": 5.60540872771973e-06, "loss": 0.0002, "num_tokens": 4069281.0, "reward": 1.399999976158142, "reward_std": 0.5014265179634094, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 235.375, "completions/mean_terminated_length": 235.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.08448625714812766, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.007231062569189817, "learning_rate": 5.6177012907191156e-06, "loss": 0.0003, "num_tokens": 4077996.0, "reward": 1.5665063858032227, "reward_std": 0.21778105199337006, "rewards/fixed_code_pass_all_test_reward/mean": 0.32692307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.15791328251361847, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2395833432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12277787178754807, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 306.25, "completions/mean_terminated_length": 306.25, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.08467072495849474, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.0070892082585487515, "learning_rate": 5.629993853718501e-06, "loss": 0.0003, "num_tokens": 4087414.0, "reward": 1.7791666984558105, "reward_std": 0.21394819021224976, "rewards/fixed_code_pass_all_test_reward/mean": 0.7291666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.15905225276947021, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 325.25, "completions/mean_terminated_length": 325.25, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.08485519276886183, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.004673175106290728, "learning_rate": 5.642286416717886e-06, "loss": 0.0002, "num_tokens": 4094752.0, "reward": 1.0062499046325684, "reward_std": 0.4065864086151123, "rewards/fixed_code_pass_all_test_reward/mean": 0.13125000894069672, "rewards/fixed_code_pass_all_test_reward/std": 0.0530330128967762, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.08503966057922892, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.00388023913546931, "learning_rate": 5.654578979717271e-06, "loss": 0.0002, "num_tokens": 4098992.0, "reward": 1.65625, "reward_std": 0.5499594211578369, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 317.25, "completions/mean_terminated_length": 317.25, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.08522412838959602, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.006806510136811994, "learning_rate": 5.666871542716656e-06, "loss": 0.0003, "num_tokens": 4106938.0, "reward": 1.0729167461395264, "reward_std": 0.1368400603532791, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 364.375, "completions/mean_terminated_length": 364.375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.0854085961999631, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.004809187114005908, "learning_rate": 5.679164105716042e-06, "loss": 0.0002, "num_tokens": 4114837.0, "reward": 1.5952379703521729, "reward_std": 0.3289310932159424, "rewards/fixed_code_pass_all_test_reward/mean": 0.7202380895614624, "rewards/fixed_code_pass_all_test_reward/std": 0.09330520778894424, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.08559306401033019, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.006680142367258668, "learning_rate": 5.691456668715427e-06, "loss": 0.0003, "num_tokens": 4123573.0, "reward": 1.7083333730697632, "reward_std": 0.3857583701610565, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.4172614812850952, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2708333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23464766144752502, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 235.0, "completions/mean_terminated_length": 235.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.08577753182069729, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.010773368005175143, "learning_rate": 5.703749231714813e-06, "loss": 0.0004, "num_tokens": 4130781.0, "reward": 2.1500000953674316, "reward_std": 0.32708433270454407, "rewards/fixed_code_pass_all_test_reward/mean": 0.8541666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.2260337918996811, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2958333492279053, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15475274622440338, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 366.25, "completions/mean_terminated_length": 366.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.08596199963106438, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.007689983744057827, "learning_rate": 5.7160417947141985e-06, "loss": 0.0003, "num_tokens": 4138359.0, "reward": 1.3964285850524902, "reward_std": 0.32020044326782227, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333283662796, "rewards/fixed_code_pass_all_test_reward/std": 0.22271770238876343, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.18809524178504944, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22507244348526, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.08614646744143147, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.00735924780019559, "learning_rate": 5.7283343577135845e-06, "loss": 0.0003, "num_tokens": 4147034.0, "reward": 1.5906250476837158, "reward_std": 0.49069589376449585, "rewards/fixed_code_pass_all_test_reward/mean": 0.140625, "rewards/fixed_code_pass_all_test_reward/std": 0.34856387972831726, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4500000476837158, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2632218301296234, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.08633093525179857, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.012361938133835793, "learning_rate": 5.74062692071297e-06, "loss": 0.0005, "num_tokens": 4153606.0, "reward": 1.8041666746139526, "reward_std": 0.5091036558151245, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05416666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11810339987277985, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.08651540306216565, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.008366180001758039, "learning_rate": 5.752919483712355e-06, "loss": 0.0003, "num_tokens": 4163061.0, "reward": 1.4833333492279053, "reward_std": 0.3664502203464508, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3583333492279053, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12817399203777313, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.08669987087253274, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.007464759313734248, "learning_rate": 5.76521204671174e-06, "loss": 0.0003, "num_tokens": 4172586.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 466.875, "completions/mean_terminated_length": 466.875, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.08688433868289984, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.004842883819947019, "learning_rate": 5.777504609711125e-06, "loss": 0.0002, "num_tokens": 4185921.0, "reward": 1.5645833015441895, "reward_std": 0.7977423667907715, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.41890934109687805, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.27291667461395264, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23986397683620453, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 544.75, "completions/mean_terminated_length": 544.75, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.08706880649326693, "frac_reward_zero_std": 1.0, "grad_norm": 0.048095703125, "kl": 0.006924803834408522, "learning_rate": 5.789797172710511e-06, "loss": 0.0003, "num_tokens": 4199903.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 525.25, "completions/mean_terminated_length": 525.25, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.08725327430363401, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.0034540566703071818, "learning_rate": 5.802089735709896e-06, "loss": 0.0001, "num_tokens": 4211625.0, "reward": 1.7472221851348877, "reward_std": 0.43236565589904785, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24722222983837128, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17715665698051453, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.08743774211400111, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.008461833844194189, "learning_rate": 5.814382298709281e-06, "loss": 0.0003, "num_tokens": 4222051.0, "reward": 1.5155303478240967, "reward_std": 0.11045841127634048, "rewards/fixed_code_pass_all_test_reward/mean": 0.13636364042758942, "rewards/fixed_code_pass_all_test_reward/std": 0.08416546881198883, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3791666626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08533314615488052, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.0876222099243682, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0095863988972269, "learning_rate": 5.8266748617086665e-06, "loss": 0.0004, "num_tokens": 4230286.0, "reward": 1.747499942779541, "reward_std": 0.27442148327827454, "rewards/fixed_code_pass_all_test_reward/mean": 0.05999999865889549, "rewards/fixed_code_pass_all_test_reward/std": 0.11109840869903564, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2474873811006546, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 455.375, "completions/mean_terminated_length": 455.375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.08780667773473529, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.007515467761550099, "learning_rate": 5.838967424708052e-06, "loss": 0.0003, "num_tokens": 4244441.0, "reward": 1.5479166507720947, "reward_std": 0.6559517979621887, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4229166805744171, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3746890425682068, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.08799114554510237, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.008569833938963711, "learning_rate": 5.851259987707437e-06, "loss": 0.0003, "num_tokens": 4248852.0, "reward": 1.9791666269302368, "reward_std": 0.5348007678985596, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1458333283662796, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0907420963048935, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 185.5, "completions/mean_terminated_length": 185.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.08817561335546947, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.007839190831873566, "learning_rate": 5.863552550706823e-06, "loss": 0.0003, "num_tokens": 4253248.0, "reward": 1.7750000953674316, "reward_std": 0.5284749269485474, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2750000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14880476891994476, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 364.75, "completions/mean_terminated_length": 364.75, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.08836008116583656, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.006649859074968845, "learning_rate": 5.875845113706208e-06, "loss": 0.0003, "num_tokens": 4260814.0, "reward": 1.4672832489013672, "reward_std": 0.32169297337532043, "rewards/fixed_code_pass_all_test_reward/mean": 0.39540815353393555, "rewards/fixed_code_pass_all_test_reward/std": 0.30640172958374023, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07187499850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.141065776348114, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 237.5, "completions/mean_terminated_length": 237.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.08854454897620365, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.00548368992167525, "learning_rate": 5.888137676705593e-06, "loss": 0.0002, "num_tokens": 4265826.0, "reward": 1.3250000476837158, "reward_std": 0.33700355887413025, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20000000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21380901336669922, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 179.25, "completions/mean_terminated_length": 179.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.08872901678657075, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.021376343443989754, "learning_rate": 5.900430239704979e-06, "loss": 0.0009, "num_tokens": 4272324.0, "reward": 1.3156249523162842, "reward_std": 0.3786577582359314, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.31562501192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3786577582359314, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 342.25, "completions/mean_terminated_length": 342.25, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.08891348459693783, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.00710154214175418, "learning_rate": 5.912722802704365e-06, "loss": 0.0003, "num_tokens": 4282310.0, "reward": 1.7707890272140503, "reward_std": 0.4796023368835449, "rewards/fixed_code_pass_all_test_reward/mean": 0.5478723645210266, "rewards/fixed_code_pass_all_test_reward/std": 0.4417502284049988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.22291666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11715459078550339, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.08909795240730492, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.013676907925400883, "learning_rate": 5.92501536570375e-06, "loss": 0.0005, "num_tokens": 4293429.0, "reward": 1.4024038314819336, "reward_std": 0.19145897030830383, "rewards/fixed_code_pass_all_test_reward/mean": 0.2211538404226303, "rewards/fixed_code_pass_all_test_reward/std": 0.15072380006313324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.18125000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13076014816761017, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.08928242021767202, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.009448598138988018, "learning_rate": 5.9373079287031355e-06, "loss": 0.0004, "num_tokens": 4299576.0, "reward": 1.790865421295166, "reward_std": 0.36305469274520874, "rewards/fixed_code_pass_all_test_reward/mean": 0.634615421295166, "rewards/fixed_code_pass_all_test_reward/std": 0.24238181114196777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22903135418891907, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 546.375, "completions/mean_terminated_length": 331.8571472167969, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.08946688802803911, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.024015249488002155, "learning_rate": 5.949600491702521e-06, "loss": 0.001, "num_tokens": 4309235.0, "reward": 2.357142925262451, "reward_std": 1.049781322479248, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8571428656578064, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3499270975589752, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 419.0, "completions/mean_terminated_length": 419.0, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.0896513558384062, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.009348930499982089, "learning_rate": 5.961893054701906e-06, "loss": 0.0004, "num_tokens": 4321091.0, "reward": 0.8583333492279053, "reward_std": 0.5479398965835571, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10833333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15507295727729797, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 237.625, "completions/mean_terminated_length": 237.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.0898358236487733, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.014940517779905349, "learning_rate": 5.974185617701292e-06, "loss": 0.0006, "num_tokens": 4329128.0, "reward": 2.1062498092651367, "reward_std": 0.1898072212934494, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.18015369772911072, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.26249998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24458420276641846, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.09002029145914038, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.019260900269728154, "learning_rate": 5.986478180700677e-06, "loss": 0.0008, "num_tokens": 4337798.0, "reward": 1.498863697052002, "reward_std": 0.4874199330806732, "rewards/fixed_code_pass_all_test_reward/mean": 0.4488636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.4612732529640198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 617.5, "completions/mean_terminated_length": 617.5, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.09020475926950747, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.006729690590873361, "learning_rate": 5.998770743700062e-06, "loss": 0.0003, "num_tokens": 4353794.0, "reward": 0.9464285373687744, "reward_std": 0.39632052183151245, "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, "rewards/fixed_code_pass_all_test_reward/std": 0.1079898551106453, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 208.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.09038922707987457, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.005567026317294221, "learning_rate": 6.011063306699447e-06, "loss": 0.0002, "num_tokens": 4358359.0, "reward": 1.024999976158142, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.09057369489024165, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.020251563691999763, "learning_rate": 6.023355869698832e-06, "loss": 0.0008, "num_tokens": 4364813.0, "reward": 1.448958396911621, "reward_std": 0.3004770576953888, "rewards/fixed_code_pass_all_test_reward/mean": 0.40833333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.3426090478897095, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04062499850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07784772664308548, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.09075816270060874, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.020141645916737616, "learning_rate": 6.0356484326982175e-06, "loss": 0.0008, "num_tokens": 4375072.0, "reward": 1.9562499523162842, "reward_std": 0.5864161849021912, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.32732683420181274, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.706250011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.32341647148132324, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.09094263051097584, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.012031996040605009, "learning_rate": 6.0479409956976035e-06, "loss": 0.0005, "num_tokens": 4383420.0, "reward": 1.7944711446762085, "reward_std": 0.30388861894607544, "rewards/fixed_code_pass_all_test_reward/mean": 0.7944711446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.3038886487483978, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 684.875, "completions/mean_terminated_length": 684.875, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.09112709832134293, "frac_reward_zero_std": 0.0, "grad_norm": 0.58984375, "kl": 0.005235915334196761, "learning_rate": 6.060233558696989e-06, "loss": 0.0002, "num_tokens": 4400723.0, "reward": 1.2604167461395264, "reward_std": 0.3286501467227936, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2604166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3286501467227936, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 410.5, "completions/mean_terminated_length": 410.5, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.09131156613171001, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.00814841638202779, "learning_rate": 6.072526121696374e-06, "loss": 0.0003, "num_tokens": 4411839.0, "reward": 1.131250023841858, "reward_std": 0.5243618488311768, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2562500238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2770217955112457, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 625.0, "completions/mean_terminated_length": 625.0, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.0914960339420771, "frac_reward_zero_std": 0.0, "grad_norm": 0.73046875, "kl": 0.004863691865466535, "learning_rate": 6.084818684695759e-06, "loss": 0.0002, "num_tokens": 4424055.0, "reward": 1.3878676891326904, "reward_std": 0.26863765716552734, "rewards/fixed_code_pass_all_test_reward/mean": 0.35661765933036804, "rewards/fixed_code_pass_all_test_reward/std": 0.24579383432865143, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.0, "completions/mean_terminated_length": 215.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.0916805017524442, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.013331853842828423, "learning_rate": 6.097111247695144e-06, "loss": 0.0005, "num_tokens": 4432599.0, "reward": 2.160416603088379, "reward_std": 0.4829586446285248, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2854166626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.31325700879096985, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 138.375, "completions/mean_terminated_length": 138.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.09186496956281129, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.016086074407212436, "learning_rate": 6.109403810694531e-06, "loss": 0.0006, "num_tokens": 4436378.0, "reward": 2.421875, "reward_std": 0.8937718272209167, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.671875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.44789937138557434, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.09204943737317837, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.00818149046972394, "learning_rate": 6.121696373693916e-06, "loss": 0.0003, "num_tokens": 4441555.0, "reward": 1.9861111640930176, "reward_std": 0.886260986328125, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3611111342906952, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19143937528133392, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 211.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.09223390518354548, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.016918018169235438, "learning_rate": 6.133988936693301e-06, "loss": 0.0007, "num_tokens": 4448668.0, "reward": 1.9663691520690918, "reward_std": 0.6980889439582825, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.591369092464447, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.288966566324234, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.09241837299391256, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.011682247393764555, "learning_rate": 6.1462814996926864e-06, "loss": 0.0005, "num_tokens": 4453094.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 366.375, "completions/mean_terminated_length": 366.375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.09260284080427965, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.007354113942710683, "learning_rate": 6.1585740626920724e-06, "loss": 0.0003, "num_tokens": 4461001.0, "reward": 1.625, "reward_std": 0.40839704871177673, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.40839704871177673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 311.125, "completions/mean_terminated_length": 311.125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.09278730861464675, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.011091607273556292, "learning_rate": 6.170866625691458e-06, "loss": 0.0004, "num_tokens": 4467586.0, "reward": 1.4482758045196533, "reward_std": 0.5851917862892151, "rewards/fixed_code_pass_all_test_reward/mean": 0.5732758641242981, "rewards/fixed_code_pass_all_test_reward/std": 0.23163843154907227, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.09297177642501384, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.01172676237183623, "learning_rate": 6.183159188690843e-06, "loss": 0.0005, "num_tokens": 4474556.0, "reward": 1.381250023841858, "reward_std": 0.4956507086753845, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13124999403953552, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1869635283946991, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.09315624423538092, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.018529567285440862, "learning_rate": 6.195451751690228e-06, "loss": 0.0007, "num_tokens": 4482632.0, "reward": 1.840144157409668, "reward_std": 0.34869858622550964, "rewards/fixed_code_pass_all_test_reward/mean": 0.04326923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.05971721187233925, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.796875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3892386257648468, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.09334071204574802, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.011983491713181138, "learning_rate": 6.207744314689613e-06, "loss": 0.0005, "num_tokens": 4486409.0, "reward": 2.674999952316284, "reward_std": 0.6923046708106995, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.800000011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3545621335506439, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 345.125, "completions/mean_terminated_length": 345.125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.09352517985611511, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.0051440691167954355, "learning_rate": 6.220036877688998e-06, "loss": 0.0002, "num_tokens": 4492418.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.0937096476664822, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.024820629507303238, "learning_rate": 6.232329440688384e-06, "loss": 0.001, "num_tokens": 4500806.0, "reward": 1.2083333730697632, "reward_std": 0.39591169357299805, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0833333358168602, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022911310196, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.0938941154768493, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.007130802056053653, "learning_rate": 6.244622003687769e-06, "loss": 0.0003, "num_tokens": 4505421.0, "reward": 1.875, "reward_std": 0.5230405330657959, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2121320515871048, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 362.125, "completions/mean_terminated_length": 362.125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.09407858328721638, "frac_reward_zero_std": 1.0, "grad_norm": 0.05078125, "kl": 0.008220591465942562, "learning_rate": 6.2569145666871545e-06, "loss": 0.0003, "num_tokens": 4513406.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 293.375, "completions/mean_terminated_length": 293.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.09426305109758347, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.012849050282966346, "learning_rate": 6.26920712968654e-06, "loss": 0.0005, "num_tokens": 4523321.0, "reward": 2.136458396911621, "reward_std": 0.25788894295692444, "rewards/fixed_code_pass_all_test_reward/mean": 0.8031250238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.25788891315460205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 208.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.09444751890795056, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.02081751369405538, "learning_rate": 6.281499692685925e-06, "loss": 0.0008, "num_tokens": 4530894.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.09463198671831766, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.016173036070540547, "learning_rate": 6.29379225568531e-06, "loss": 0.0006, "num_tokens": 4539088.0, "reward": 2.713235378265381, "reward_std": 0.2626107633113861, "rewards/fixed_code_pass_all_test_reward/mean": 0.8382353186607361, "rewards/fixed_code_pass_all_test_reward/std": 0.2736581563949585, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18322506546974182, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 241.5, "completions/mean_terminated_length": 241.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.09481645452868474, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.01340669981436804, "learning_rate": 6.306084818684697e-06, "loss": 0.0005, "num_tokens": 4544884.0, "reward": 1.5098683834075928, "reward_std": 0.39390575885772705, "rewards/fixed_code_pass_all_test_reward/mean": 0.6348683834075928, "rewards/fixed_code_pass_all_test_reward/std": 0.2656601071357727, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 210.5, "completions/mean_terminated_length": 210.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.09500092233905183, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.009709439967991784, "learning_rate": 6.318377381684082e-06, "loss": 0.0004, "num_tokens": 4549688.0, "reward": 1.679464340209961, "reward_std": 0.7912902235984802, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17946428060531616, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1627276986837387, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.09518539014941893, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.014970872784033418, "learning_rate": 6.330669944683467e-06, "loss": 0.0006, "num_tokens": 4553608.0, "reward": 0.7864583134651184, "reward_std": 0.7491521239280701, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0364583358168602, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.06842003017663956, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 660.375, "completions/mean_terminated_length": 660.375, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.09536985795978602, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.007427244738209993, "learning_rate": 6.342962507682853e-06, "loss": 0.0003, "num_tokens": 4568115.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 453.875, "completions/mean_terminated_length": 453.875, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.0955543257701531, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.010454386589117348, "learning_rate": 6.355255070682238e-06, "loss": 0.0004, "num_tokens": 4576938.0, "reward": 1.2142857313156128, "reward_std": 0.3239695131778717, "rewards/fixed_code_pass_all_test_reward/mean": 0.3392857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.4107697308063507, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 478.75, "completions/mean_terminated_length": 254.57144165039062, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.0957387935805202, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.024725150142330676, "learning_rate": 6.367547633681623e-06, "loss": 0.001, "num_tokens": 4587768.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 253.0, "completions/mean_terminated_length": 253.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.09592326139088729, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.015151565778069198, "learning_rate": 6.3798401966810086e-06, "loss": 0.0006, "num_tokens": 4595936.0, "reward": 1.4178571701049805, "reward_std": 0.37737905979156494, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.29285717010498047, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.31794407963752747, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 461.75, "completions/mean_terminated_length": 461.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.09610772920125438, "frac_reward_zero_std": 0.0, "grad_norm": 0.70703125, "kl": 0.00998893560608849, "learning_rate": 6.392132759680394e-06, "loss": 0.0004, "num_tokens": 4607454.0, "reward": 2.022916793823242, "reward_std": 0.467426061630249, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.14791667461395264, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22632986307144165, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 263.625, "completions/mean_terminated_length": 263.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.09629219701162148, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.016220767225604504, "learning_rate": 6.404425322679779e-06, "loss": 0.0006, "num_tokens": 4616859.0, "reward": 1.6927709579467773, "reward_std": 0.14937537908554077, "rewards/fixed_code_pass_all_test_reward/mean": 0.6927710771560669, "rewards/fixed_code_pass_all_test_reward/std": 0.14937537908554077, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.09647666482198856, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.015252106415573508, "learning_rate": 6.416717885679165e-06, "loss": 0.0006, "num_tokens": 4627878.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 377.25, "completions/mean_terminated_length": 377.25, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.09666113263235565, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.004980442114174366, "learning_rate": 6.42901044867855e-06, "loss": 0.0002, "num_tokens": 4634416.0, "reward": 1.6218750476837158, "reward_std": 0.5614773035049438, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24687500298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17445707321166992, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.09684560044272275, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.016181703540496528, "learning_rate": 6.441303011677935e-06, "loss": 0.0006, "num_tokens": 4642265.0, "reward": 1.8125, "reward_std": 0.33330363035202026, "rewards/fixed_code_pass_all_test_reward/mean": 0.5708333253860474, "rewards/fixed_code_pass_all_test_reward/std": 0.3614071011543274, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24166665971279144, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13183684647083282, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 429.5, "completions/mean_terminated_length": 429.5, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.09703006825308984, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.013237189967185259, "learning_rate": 6.45359557467732e-06, "loss": 0.0005, "num_tokens": 4654485.0, "reward": 1.9944443702697754, "reward_std": 0.25078362226486206, "rewards/fixed_code_pass_all_test_reward/mean": 0.7861111164093018, "rewards/fixed_code_pass_all_test_reward/std": 0.13435454666614532, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2083333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17251639068126678, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.09721453606345692, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.009643136407248676, "learning_rate": 6.4658881376767055e-06, "loss": 0.0004, "num_tokens": 4663045.0, "reward": 2.050595283508301, "reward_std": 0.16319887340068817, "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.08625821024179459, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1547619104385376, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14057128131389618, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 605.375, "completions/mean_terminated_length": 399.2857360839844, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.09739900387382401, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.00728736755263526, "learning_rate": 6.4781807006760915e-06, "loss": 0.0003, "num_tokens": 4674960.0, "reward": 1.4657924175262451, "reward_std": 0.7184256911277771, "rewards/fixed_code_pass_all_test_reward/mean": 0.517578125, "rewards/fixed_code_pass_all_test_reward/std": 0.5172416567802429, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07321428507566452, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08275064080953598, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 264.5, "completions/mean_terminated_length": 264.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.09758347168419111, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.016006456688046455, "learning_rate": 6.490473263675477e-06, "loss": 0.0006, "num_tokens": 4682804.0, "reward": 1.6979167461395264, "reward_std": 0.5038295388221741, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 199.875, "completions/mean_terminated_length": 199.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.0977679394945582, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.027162911894265562, "learning_rate": 6.502765826674863e-06, "loss": 0.0011, "num_tokens": 4692483.0, "reward": 1.8673913478851318, "reward_std": 0.37426450848579407, "rewards/fixed_code_pass_all_test_reward/mean": 0.2590579688549042, "rewards/fixed_code_pass_all_test_reward/std": 0.3823009431362152, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6083333492279053, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.32562515139579773, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.09795240730492528, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.03220325673464686, "learning_rate": 6.515058389674248e-06, "loss": 0.0013, "num_tokens": 4701459.0, "reward": 2.125, "reward_std": 0.6923046708106995, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.5014265775680542, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7250000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.337003618478775, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 196.5, "completions/mean_terminated_length": 196.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.09813687511529239, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.025292708771303296, "learning_rate": 6.527350952673634e-06, "loss": 0.001, "num_tokens": 4709703.0, "reward": 1.5208333730697632, "reward_std": 0.4666454493999481, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5208333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.46664541959762573, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.09832134292565947, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.016021956980694085, "learning_rate": 6.539643515673019e-06, "loss": 0.0006, "num_tokens": 4713724.0, "reward": 2.0124998092651367, "reward_std": 0.42151933908462524, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13749998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1157275140285492, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.09850581073602656, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.029198663076385856, "learning_rate": 6.551936078672404e-06, "loss": 0.0012, "num_tokens": 4721117.0, "reward": 2.224679470062256, "reward_std": 0.3462364971637726, "rewards/fixed_code_pass_all_test_reward/mean": 0.7788461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.3523559272289276, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.44583332538604736, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2921934127807617, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 321.125, "completions/mean_terminated_length": 321.125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.09869027854639366, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.00949045637389645, "learning_rate": 6.564228641671789e-06, "loss": 0.0004, "num_tokens": 4728334.0, "reward": 1.0416667461395264, "reward_std": 0.11785116046667099, "rewards/fixed_code_pass_all_test_reward/mean": 0.0416666679084301, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511455655098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 233.25, "completions/mean_terminated_length": 233.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.09887474635676075, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.014269422157667577, "learning_rate": 6.576521204671174e-06, "loss": 0.0006, "num_tokens": 4736712.0, "reward": 2.512500047683716, "reward_std": 0.5986592173576355, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.637499988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4405759274959564, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 240.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.09905921416712783, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.03594012965913862, "learning_rate": 6.5888137676705596e-06, "loss": 0.0014, "num_tokens": 4744596.0, "reward": 1.8229167461395264, "reward_std": 0.42360299825668335, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4479166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35894879698753357, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 485.5, "completions/mean_terminated_length": 485.5, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.09924368197749493, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.010496319242520258, "learning_rate": 6.6011063306699456e-06, "loss": 0.0004, "num_tokens": 4757672.0, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.09942814978786202, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.021167909842915833, "learning_rate": 6.613398893669331e-06, "loss": 0.0008, "num_tokens": 4766681.0, "reward": 2.352083206176758, "reward_std": 0.5725674629211426, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.33662354946136475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7583333253860474, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.347211092710495, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.0996126175982291, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.008300724148284644, "learning_rate": 6.625691456668716e-06, "loss": 0.0003, "num_tokens": 4771784.0, "reward": 1.850000023841858, "reward_std": 0.47509393095970154, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10000000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10690450668334961, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.0997970854085962, "frac_reward_zero_std": 0.0, "grad_norm": 3.765625, "kl": 0.020759329432621598, "learning_rate": 6.637984019668101e-06, "loss": 0.0008, "num_tokens": 4775623.0, "reward": 1.4625000953674316, "reward_std": 0.2973093092441559, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4625000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.29730936884880066, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.09998155321896329, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.016750554146710783, "learning_rate": 6.650276582667486e-06, "loss": 0.0007, "num_tokens": 4785260.0, "reward": 2.25, "reward_std": 0.23335231840610504, "rewards/fixed_code_pass_all_test_reward/mean": 0.28125, "rewards/fixed_code_pass_all_test_reward/std": 0.19097672402858734, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 253.0, "completions/mean_terminated_length": 253.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.10016602102933038, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.02026432822458446, "learning_rate": 6.662569145666872e-06, "loss": 0.0008, "num_tokens": 4794956.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 362.25, "completions/mean_terminated_length": 362.25, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.10035048883969747, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "kl": 0.012909231649246067, "learning_rate": 6.674861708666257e-06, "loss": 0.0005, "num_tokens": 4802902.0, "reward": 1.140625, "reward_std": 0.0867956355214119, "rewards/fixed_code_pass_all_test_reward/mean": 0.140625, "rewards/fixed_code_pass_all_test_reward/std": 0.0867956355214119, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 326.0, "completions/mean_terminated_length": 326.0, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.10053495665006457, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.014221336925402284, "learning_rate": 6.6871542716656425e-06, "loss": 0.0006, "num_tokens": 4810462.0, "reward": 0.90625, "reward_std": 0.376485139131546, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 188.75, "completions/mean_terminated_length": 188.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.10071942446043165, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.008886056981282309, "learning_rate": 6.699446834665028e-06, "loss": 0.0004, "num_tokens": 4815028.0, "reward": 2.1229166984558105, "reward_std": 0.3803651034832001, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24791666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07737637311220169, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.10090389227079874, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.03885711473412812, "learning_rate": 6.7117393976644145e-06, "loss": 0.0016, "num_tokens": 4820439.0, "reward": 1.9322917461395264, "reward_std": 0.47031813859939575, "rewards/fixed_code_pass_all_test_reward/mean": 0.765625, "rewards/fixed_code_pass_all_test_reward/std": 0.43526214361190796, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1666666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19416078925132751, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.10108836008116584, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.03589719021692872, "learning_rate": 6.7240319606638e-06, "loss": 0.0014, "num_tokens": 4829201.0, "reward": 2.03125, "reward_std": 0.7610788941383362, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.65625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39949744939804077, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.10127282789153293, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.016340418544132262, "learning_rate": 6.736324523663185e-06, "loss": 0.0007, "num_tokens": 4837302.0, "reward": 1.1166666746139526, "reward_std": 0.13213753700256348, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11666666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1321374922990799, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.10145729570190001, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.009668886137660593, "learning_rate": 6.74861708666257e-06, "loss": 0.0004, "num_tokens": 4842161.0, "reward": 1.4887819290161133, "reward_std": 0.497643381357193, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11378205567598343, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17233765125274658, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 247.125, "completions/mean_terminated_length": 247.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.10164176351226711, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.030221675406210124, "learning_rate": 6.760909649661955e-06, "loss": 0.0012, "num_tokens": 4851490.0, "reward": 1.8893229961395264, "reward_std": 0.4405342936515808, "rewards/fixed_code_pass_all_test_reward/mean": 0.31640625, "rewards/fixed_code_pass_all_test_reward/std": 0.2762135863304138, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5729166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.27973026037216187, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.1018262313226342, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.007971877610543743, "learning_rate": 6.77320221266134e-06, "loss": 0.0003, "num_tokens": 4855904.0, "reward": 2.2604167461395264, "reward_std": 0.15169353783130646, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2604166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15169349312782288, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.10201069913300129, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.03714943618979305, "learning_rate": 6.785494775660726e-06, "loss": 0.0015, "num_tokens": 4862804.0, "reward": 1.9322917461395264, "reward_std": 0.6369639039039612, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6822916865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4476224184036255, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 247.125, "completions/mean_terminated_length": 247.125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.10219516694336839, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.025254888110794127, "learning_rate": 6.797787338660111e-06, "loss": 0.001, "num_tokens": 4870069.0, "reward": 2.808333396911621, "reward_std": 0.34994328022003174, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9333333373069763, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12848320603370667, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 226.375, "completions/mean_terminated_length": 226.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.10237963475373547, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.016983077279292047, "learning_rate": 6.8100799016594965e-06, "loss": 0.0007, "num_tokens": 4875920.0, "reward": 1.2135416269302368, "reward_std": 0.5881761312484741, "rewards/fixed_code_pass_all_test_reward/mean": 0.1197916641831398, "rewards/fixed_code_pass_all_test_reward/std": 0.33882200717926025, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.263287752866745, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.10256410256410256, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.029242778662592173, "learning_rate": 6.822372464658882e-06, "loss": 0.0012, "num_tokens": 4885556.0, "reward": 1.7916667461395264, "reward_std": 0.3053750991821289, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3053750991821289, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 122.75, "completions/mean_terminated_length": 122.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.10274857037446966, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.013174662250094116, "learning_rate": 6.834665027658267e-06, "loss": 0.0005, "num_tokens": 4889410.0, "reward": 2.2750000953674316, "reward_std": 0.9676923751831055, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6499999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3338091969490051, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.10293303818483675, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.03575339203234762, "learning_rate": 6.846957590657653e-06, "loss": 0.0014, "num_tokens": 4896182.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.10311750599520383, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.025707965716719627, "learning_rate": 6.859250153657038e-06, "loss": 0.001, "num_tokens": 4901286.0, "reward": 1.3979166746139526, "reward_std": 0.5795754790306091, "rewards/fixed_code_pass_all_test_reward/mean": 0.38333332538604736, "rewards/fixed_code_pass_all_test_reward/std": 0.25385910272598267, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13958333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2363642454147339, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 311.75, "completions/mean_terminated_length": 311.75, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.10330197380557093, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.01242942939279601, "learning_rate": 6.871542716656423e-06, "loss": 0.0005, "num_tokens": 4908380.0, "reward": 1.834821343421936, "reward_std": 0.16414976119995117, "rewards/fixed_code_pass_all_test_reward/mean": 0.8348214626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.16414979100227356, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 469.0, "completions/mean_terminated_length": 469.0, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.10348644161593802, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.016845720121636987, "learning_rate": 6.883835279655808e-06, "loss": 0.0007, "num_tokens": 4920796.0, "reward": 1.058333396911621, "reward_std": 0.41623789072036743, "rewards/fixed_code_pass_all_test_reward/mean": 0.3083333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.22236107289791107, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.10367090942630511, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.010123583604581654, "learning_rate": 6.8961278426551934e-06, "loss": 0.0004, "num_tokens": 4925501.0, "reward": 1.15625, "reward_std": 0.35197147727012634, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 110.875, "completions/mean_terminated_length": 110.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.1038553772366722, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.042233023792505264, "learning_rate": 6.90842040565458e-06, "loss": 0.0017, "num_tokens": 4933436.0, "reward": 1.8645834922790527, "reward_std": 0.3462885320186615, "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, "rewards/fixed_code_pass_all_test_reward/std": 0.02227177284657955, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.84375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35197150707244873, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 726.25, "completions/mean_terminated_length": 726.25, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.1040398450470393, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.005389863072196022, "learning_rate": 6.9207129686539654e-06, "loss": 0.0002, "num_tokens": 4949782.0, "reward": 1.625, "reward_std": 0.3053750991821289, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.3053751289844513, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 309.25, "completions/mean_terminated_length": 309.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.10422431285740638, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.017441508796764538, "learning_rate": 6.933005531653351e-06, "loss": 0.0007, "num_tokens": 4956240.0, "reward": 2.0920138359069824, "reward_std": 0.4200744926929474, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2795138955116272, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20803385972976685, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 676.5, "completions/mean_terminated_length": 676.5, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.10440878066777347, "frac_reward_zero_std": 0.0, "grad_norm": 0.71484375, "kl": 0.00655378692317754, "learning_rate": 6.945298094652736e-06, "loss": 0.0003, "num_tokens": 4972212.0, "reward": 2.285416603088379, "reward_std": 0.6356747150421143, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7229167222976685, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.384102463722229, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.10459324847814057, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.011100360658019781, "learning_rate": 6.957590657652121e-06, "loss": 0.0004, "num_tokens": 4978589.0, "reward": 2.686011791229248, "reward_std": 0.0980716347694397, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6860119104385376, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09807168692350388, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.10477771628850766, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.01389246468897909, "learning_rate": 6.969883220651507e-06, "loss": 0.0006, "num_tokens": 4982760.0, "reward": 1.5499999523162842, "reward_std": 0.4985693693161011, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213627576828, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 303.625, "completions/mean_terminated_length": 303.625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.10496218409887474, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.010290982434526086, "learning_rate": 6.982175783650892e-06, "loss": 0.0004, "num_tokens": 4989565.0, "reward": 2.1972827911376953, "reward_std": 0.18343405425548553, "rewards/fixed_code_pass_all_test_reward/mean": 0.8097826242446899, "rewards/fixed_code_pass_all_test_reward/std": 0.07685943692922592, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.38749998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1642080694437027, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.10514665190924184, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.02376055857166648, "learning_rate": 6.994468346650277e-06, "loss": 0.0009, "num_tokens": 4998097.0, "reward": 2.2208333015441895, "reward_std": 0.10493770241737366, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.22083333134651184, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10493762791156769, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 180.375, "completions/mean_terminated_length": 180.375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.10533111971960893, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.009408530953805894, "learning_rate": 7.006760909649662e-06, "loss": 0.0004, "num_tokens": 5002500.0, "reward": 1.085714340209961, "reward_std": 0.12121830880641937, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08571428805589676, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12121831625699997, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.10551558752997602, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.010523514705710113, "learning_rate": 7.0190534726490475e-06, "loss": 0.0004, "num_tokens": 5007085.0, "reward": 1.985714316368103, "reward_std": 0.623354971408844, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.23571428656578064, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19897699356079102, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 241.625, "completions/mean_terminated_length": 241.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.10570005534034312, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.03283955343067646, "learning_rate": 7.0313460356484335e-06, "loss": 0.0013, "num_tokens": 5018306.0, "reward": 1.568750023841858, "reward_std": 0.5049310326576233, "rewards/fixed_code_pass_all_test_reward/mean": 0.10000000149011612, "rewards/fixed_code_pass_all_test_reward/std": 0.10690450668334961, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.46875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4712730050086975, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.1058845231507102, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.06987947644665837, "learning_rate": 7.043638598647819e-06, "loss": 0.0028, "num_tokens": 5025458.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.10606899096107729, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.041142215952277184, "learning_rate": 7.055931161647204e-06, "loss": 0.0016, "num_tokens": 5034865.0, "reward": 2.533928632736206, "reward_std": 0.4323824346065521, "rewards/fixed_code_pass_all_test_reward/mean": 0.9047619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.19216933846473694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6291666626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.41991594433784485, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 125.875, "completions/mean_terminated_length": 125.875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.10625345877144439, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.03415529103949666, "learning_rate": 7.068223724646589e-06, "loss": 0.0014, "num_tokens": 5039320.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 526.0, "completions/mean_terminated_length": 526.0, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.10643792658181148, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.013369846157729626, "learning_rate": 7.080516287645974e-06, "loss": 0.0005, "num_tokens": 5053824.0, "reward": 0.9567307829856873, "reward_std": 1.3385874032974243, "rewards/fixed_code_pass_all_test_reward/mean": 0.26923078298568726, "rewards/fixed_code_pass_all_test_reward/std": 0.39864522218704224, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 140.5, "completions/mean_terminated_length": 140.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.10662239439217856, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0633984743617475, "learning_rate": 7.092808850645359e-06, "loss": 0.0025, "num_tokens": 5060844.0, "reward": 2.0625, "reward_std": 0.4537104666233063, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.27368009090423584, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.10680686220254565, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.03065081313252449, "learning_rate": 7.105101413644746e-06, "loss": 0.0012, "num_tokens": 5069713.0, "reward": 2.125, "reward_std": 0.5824823379516602, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 210.5, "completions/mean_terminated_length": 210.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.10699133001291275, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.038989087799564004, "learning_rate": 7.117393976644131e-06, "loss": 0.0016, "num_tokens": 5077877.0, "reward": 2.3614864349365234, "reward_std": 0.26613911986351013, "rewards/fixed_code_pass_all_test_reward/mean": 0.7364864945411682, "rewards/fixed_code_pass_all_test_reward/std": 0.23727117478847504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511381149292, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.10717579782327984, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.038289712741971016, "learning_rate": 7.1296865396435164e-06, "loss": 0.0015, "num_tokens": 5086453.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 257.25, "completions/mean_terminated_length": 257.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.10736026563364692, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.00885611993726343, "learning_rate": 7.141979102642902e-06, "loss": 0.0004, "num_tokens": 5092423.0, "reward": 2.9000000953674316, "reward_std": 0.2828427255153656, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8999999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2828427255153656, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.10754473344401402, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.03824321646243334, "learning_rate": 7.154271665642288e-06, "loss": 0.0015, "num_tokens": 5098554.0, "reward": 2.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 336.75, "completions/mean_terminated_length": 336.75, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.10772920125438111, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.012342206086032093, "learning_rate": 7.166564228641673e-06, "loss": 0.0005, "num_tokens": 5109976.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.1079136690647482, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.022929065278731287, "learning_rate": 7.178856791641058e-06, "loss": 0.0009, "num_tokens": 5115846.0, "reward": 1.1800000667572021, "reward_std": 0.17717359960079193, "rewards/fixed_code_pass_all_test_reward/mean": 0.054999999701976776, "rewards/fixed_code_pass_all_test_reward/std": 0.01414213515818119, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17251639068126678, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.1080981368751153, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.033498691162094474, "learning_rate": 7.191149354640443e-06, "loss": 0.0013, "num_tokens": 5124247.0, "reward": 1.22265625, "reward_std": 0.6581472158432007, "rewards/fixed_code_pass_all_test_reward/mean": 0.34765625, "rewards/fixed_code_pass_all_test_reward/std": 0.45697641372680664, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.10828260468548238, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.04189664463046938, "learning_rate": 7.203441917639828e-06, "loss": 0.0017, "num_tokens": 5134289.0, "reward": 1.6773648262023926, "reward_std": 0.43987542390823364, "rewards/fixed_code_pass_all_test_reward/mean": 0.6773648262023926, "rewards/fixed_code_pass_all_test_reward/std": 0.4398754835128784, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.10846707249584947, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.04294752184068784, "learning_rate": 7.215734480639214e-06, "loss": 0.0017, "num_tokens": 5143190.0, "reward": 1.5625, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4955156147480011, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 377.125, "completions/mean_terminated_length": 377.125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.10865154030621657, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.015478167741093785, "learning_rate": 7.228027043638599e-06, "loss": 0.0006, "num_tokens": 5151335.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.10883600811658366, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.005122561240568757, "learning_rate": 7.2403196066379845e-06, "loss": 0.0002, "num_tokens": 5156361.0, "reward": 2.450000047683716, "reward_std": 0.14142127335071564, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.44999998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213627576828, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 542.75, "completions/mean_terminated_length": 542.75, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.10902047592695074, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.005845645209774375, "learning_rate": 7.25261216963737e-06, "loss": 0.0002, "num_tokens": 5166855.0, "reward": 1.9722222089767456, "reward_std": 0.051434475928545, "rewards/fixed_code_pass_all_test_reward/mean": 0.9722222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.05143444985151291, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.10920494373731784, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.035966287017799914, "learning_rate": 7.264904732636755e-06, "loss": 0.0014, "num_tokens": 5173076.0, "reward": 0.9001811742782593, "reward_std": 0.47684553265571594, "rewards/fixed_code_pass_all_test_reward/mean": 0.054347824305295944, "rewards/fixed_code_pass_all_test_reward/std": 0.12019877880811691, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09583333134651184, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14847104251384735, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.10938941154768493, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.03816897701472044, "learning_rate": 7.27719729563614e-06, "loss": 0.0015, "num_tokens": 5181481.0, "reward": 1.7625000476837158, "reward_std": 0.4068607687950134, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2121320515871048, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 259.75, "completions/mean_terminated_length": 259.75, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.10957387935805202, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.022806130349636078, "learning_rate": 7.289489858635526e-06, "loss": 0.0009, "num_tokens": 5187535.0, "reward": 1.2950000762939453, "reward_std": 0.20860077440738678, "rewards/fixed_code_pass_all_test_reward/mean": 0.18250000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.007071066647768021, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11249999701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21001701056957245, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.1097583471684191, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.05451415036804974, "learning_rate": 7.301782421634912e-06, "loss": 0.0022, "num_tokens": 5197016.0, "reward": 1.875, "reward_std": 0.3494894802570343, "rewards/fixed_code_pass_all_test_reward/mean": 0.4749999940395355, "rewards/fixed_code_pass_all_test_reward/std": 0.32403701543807983, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4000000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19272483885288239, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 198.75, "completions/mean_terminated_length": 198.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.1099428149787862, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.06505821878090501, "learning_rate": 7.314074984634297e-06, "loss": 0.0026, "num_tokens": 5204750.0, "reward": 1.9583333730697632, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4583333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.11012728278915329, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.01625425519887358, "learning_rate": 7.326367547633682e-06, "loss": 0.0007, "num_tokens": 5209613.0, "reward": 1.320698857307434, "reward_std": 0.690812349319458, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07069892436265945, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09167618304491043, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.11031175059952038, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.014353133621625602, "learning_rate": 7.338660110633068e-06, "loss": 0.0006, "num_tokens": 5213506.0, "reward": 1.9312500953674316, "reward_std": 0.6076521277427673, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.18125000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22668024897575378, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.11049621840988748, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.03187446226365864, "learning_rate": 7.350952673632453e-06, "loss": 0.0013, "num_tokens": 5220056.0, "reward": 1.4505763053894043, "reward_std": 0.6011616587638855, "rewards/fixed_code_pass_all_test_reward/mean": 0.5026595592498779, "rewards/fixed_code_pass_all_test_reward/std": 0.2031051367521286, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 494.5, "completions/mean_terminated_length": 494.5, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.11068068622025456, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.030982364201918244, "learning_rate": 7.3632452366318386e-06, "loss": 0.0012, "num_tokens": 5235100.0, "reward": 1.3928570747375488, "reward_std": 0.3949388265609741, "rewards/fixed_code_pass_all_test_reward/mean": 0.3928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.3949388265609741, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 323.75, "completions/mean_terminated_length": 323.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.11086515403062165, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.023737480980344117, "learning_rate": 7.375537799631224e-06, "loss": 0.0009, "num_tokens": 5241962.0, "reward": 1.4839743375778198, "reward_std": 0.573258638381958, "rewards/fixed_code_pass_all_test_reward/mean": 0.567307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.3583032786846161, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 190.875, "completions/mean_terminated_length": 190.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.11104962184098875, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.033775414573028684, "learning_rate": 7.387830362630609e-06, "loss": 0.0014, "num_tokens": 5250761.0, "reward": 2.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.11123408965135584, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.040771269239485264, "learning_rate": 7.400122925629995e-06, "loss": 0.0016, "num_tokens": 5256889.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.11141855746172293, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.04431301145814359, "learning_rate": 7.41241548862938e-06, "loss": 0.0018, "num_tokens": 5266453.0, "reward": 1.7922297716140747, "reward_std": 0.4151931405067444, "rewards/fixed_code_pass_all_test_reward/mean": 0.9172297716140747, "rewards/fixed_code_pass_all_test_reward/std": 0.11453791707754135, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 469.625, "completions/mean_terminated_length": 469.625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.11160302527209003, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.024165194714441895, "learning_rate": 7.424708051628765e-06, "loss": 0.001, "num_tokens": 5275914.0, "reward": 1.375, "reward_std": 0.11785111576318741, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511306643486, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 431.375, "completions/mean_terminated_length": 431.375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.11178749308245711, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.027288693585433066, "learning_rate": 7.43700061462815e-06, "loss": 0.0011, "num_tokens": 5286693.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1119719608928242, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.04041301645338535, "learning_rate": 7.4492931776275355e-06, "loss": 0.0016, "num_tokens": 5291847.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 475.0, "completions/mean_terminated_length": 475.0, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.1121564287031913, "frac_reward_zero_std": 0.0, "grad_norm": 0.703125, "kl": 0.02457394532393664, "learning_rate": 7.461585740626921e-06, "loss": 0.001, "num_tokens": 5305703.0, "reward": 2.1354167461395264, "reward_std": 1.0019203424453735, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7604166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.38688188791275024, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.11234089651355839, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.05193824600428343, "learning_rate": 7.473878303626307e-06, "loss": 0.0021, "num_tokens": 5314357.0, "reward": 1.9583333730697632, "reward_std": 0.3421454429626465, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7083333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.263523131608963, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 373.125, "completions/mean_terminated_length": 373.125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.11252536432392547, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.03768195060547441, "learning_rate": 7.486170866625692e-06, "loss": 0.0015, "num_tokens": 5324662.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 365.75, "completions/mean_terminated_length": 365.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.11270983213429256, "frac_reward_zero_std": 1.0, "grad_norm": 0.03564453125, "kl": 0.014097794890403748, "learning_rate": 7.498463429625077e-06, "loss": 0.0006, "num_tokens": 5333012.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 286.5, "completions/mean_terminated_length": 286.5, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.11289429994465966, "frac_reward_zero_std": 0.0, "grad_norm": 0.65234375, "kl": 0.024473309400491416, "learning_rate": 7.510755992624463e-06, "loss": 0.001, "num_tokens": 5342688.0, "reward": 2.3522727489471436, "reward_std": 0.11785121262073517, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511381149292, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 442.125, "completions/mean_terminated_length": 442.125, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.11307876775502675, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.01967685658019036, "learning_rate": 7.523048555623849e-06, "loss": 0.0008, "num_tokens": 5356241.0, "reward": 1.6666667461395264, "reward_std": 0.5634361505508423, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5416666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.501980185508728, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 198.75, "completions/mean_terminated_length": 198.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.11326323556539383, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.030449879122897983, "learning_rate": 7.535341118623234e-06, "loss": 0.0012, "num_tokens": 5364223.0, "reward": 2.5833334922790527, "reward_std": 0.7292091846466064, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8333333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3563483655452728, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 231.5, "completions/mean_terminated_length": 231.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.11344770337576093, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.0256505316356197, "learning_rate": 7.547633681622619e-06, "loss": 0.001, "num_tokens": 5370291.0, "reward": 1.5494506359100342, "reward_std": 0.37853461503982544, "rewards/fixed_code_pass_all_test_reward/mean": 0.5494505763053894, "rewards/fixed_code_pass_all_test_reward/std": 0.37853461503982544, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.11363217118612802, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.026566117187030613, "learning_rate": 7.559926244622004e-06, "loss": 0.0011, "num_tokens": 5378603.0, "reward": 2.05078125, "reward_std": 0.8852930068969727, "rewards/fixed_code_pass_all_test_reward/mean": 0.30078125, "rewards/fixed_code_pass_all_test_reward/std": 0.3344678282737732, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 283.875, "completions/mean_terminated_length": 283.875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.1138166389964951, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.03561583301052451, "learning_rate": 7.5722188076213895e-06, "loss": 0.0014, "num_tokens": 5387770.0, "reward": 2.6041667461395264, "reward_std": 0.48744142055511475, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7291666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3078019917011261, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 314.25, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.1140011068068622, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.033530432265251875, "learning_rate": 7.5845113706207755e-06, "loss": 0.0013, "num_tokens": 5398092.0, "reward": 2.196666717529297, "reward_std": 0.6825442910194397, "rewards/fixed_code_pass_all_test_reward/mean": 0.6299999952316284, "rewards/fixed_code_pass_all_test_reward/std": 0.5107976794242859, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6916666626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39671269059181213, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 248.5, "completions/mean_terminated_length": 248.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1141855746172293, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884765625, "kl": 0.02939809230156243, "learning_rate": 7.596803933620161e-06, "loss": 0.0012, "num_tokens": 5407016.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.11437004242759638, "frac_reward_zero_std": 1.0, "grad_norm": 0.255859375, "kl": 0.050110804149881005, "learning_rate": 7.609096496619546e-06, "loss": 0.002, "num_tokens": 5417357.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.11455451023796348, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.01252962538274005, "learning_rate": 7.621389059618931e-06, "loss": 0.0005, "num_tokens": 5421374.0, "reward": 1.9500000476837158, "reward_std": 0.3162277340888977, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10350984334945679, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.11473897804833057, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.05119718401692808, "learning_rate": 7.633681622618316e-06, "loss": 0.002, "num_tokens": 5429039.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.11492344585869765, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.010541982715949416, "learning_rate": 7.645974185617702e-06, "loss": 0.0004, "num_tokens": 5432939.0, "reward": 2.393749952316284, "reward_std": 0.5684801340103149, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.518750011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22350695729255676, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.11510791366906475, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.06170850759372115, "learning_rate": 7.658266748617086e-06, "loss": 0.0025, "num_tokens": 5440148.0, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39591163396835327, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 636.625, "completions/mean_terminated_length": 636.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.11529238147943184, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.01835584855871275, "learning_rate": 7.670559311616472e-06, "loss": 0.0007, "num_tokens": 5456321.0, "reward": 1.4675925970077515, "reward_std": 0.42399492859840393, "rewards/fixed_code_pass_all_test_reward/mean": 0.09259258955717087, "rewards/fixed_code_pass_all_test_reward/std": 0.09898564219474792, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4432026445865631, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 387.125, "completions/mean_terminated_length": 387.125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.11547684928979893, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.015926650492474437, "learning_rate": 7.682851874615857e-06, "loss": 0.0006, "num_tokens": 5466434.0, "reward": 1.5812499523162842, "reward_std": 0.11319232732057571, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08124999701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11319231241941452, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 240.375, "completions/mean_terminated_length": 240.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.11566131710016603, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.04630450648255646, "learning_rate": 7.695144437615243e-06, "loss": 0.0019, "num_tokens": 5474941.0, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7916666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39591163396835327, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.11584578491053311, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.022758341277949512, "learning_rate": 7.707437000614629e-06, "loss": 0.0009, "num_tokens": 5478819.0, "reward": 1.2062499523162842, "reward_std": 0.6868340969085693, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08124999701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11319231986999512, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 581.25, "completions/mean_terminated_length": 581.25, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.1160302527209002, "frac_reward_zero_std": 0.0, "grad_norm": 0.74609375, "kl": 0.01326482102740556, "learning_rate": 7.719729563614015e-06, "loss": 0.0005, "num_tokens": 5490333.0, "reward": 1.1458332538604736, "reward_std": 0.058925557881593704, "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.11621472053126729, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.014536608126945794, "learning_rate": 7.732022126613399e-06, "loss": 0.0006, "num_tokens": 5494877.0, "reward": 1.4500000476837158, "reward_std": 0.4242641031742096, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.32500001788139343, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1832250952720642, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.11639918834163439, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.021751164691522717, "learning_rate": 7.744314689612785e-06, "loss": 0.0009, "num_tokens": 5502334.0, "reward": 1.7727272510528564, "reward_std": 0.3275522291660309, "rewards/fixed_code_pass_all_test_reward/mean": 0.7414772510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.30125120282173157, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 203.5, "completions/mean_terminated_length": 203.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.11658365615200147, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.03417786257341504, "learning_rate": 7.756607252612171e-06, "loss": 0.0014, "num_tokens": 5507818.0, "reward": 1.576923131942749, "reward_std": 0.45228826999664307, "rewards/fixed_code_pass_all_test_reward/mean": 0.5769230723381042, "rewards/fixed_code_pass_all_test_reward/std": 0.45228826999664307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 209.625, "completions/mean_terminated_length": 209.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.11676812396236856, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.010126265697181225, "learning_rate": 7.768899815611555e-06, "loss": 0.0004, "num_tokens": 5512439.0, "reward": 2.625, "reward_std": 1.0606601238250732, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 265.5, "completions/mean_terminated_length": 265.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.11695259177273566, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.05011038086377084, "learning_rate": 7.781192378610941e-06, "loss": 0.002, "num_tokens": 5522795.0, "reward": 1.875, "reward_std": 0.9910312294960022, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 264.75, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.11713705958310275, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.035488858353346586, "learning_rate": 7.793484941610326e-06, "loss": 0.0014, "num_tokens": 5532137.0, "reward": 1.9041666984558105, "reward_std": 0.3438519537448883, "rewards/fixed_code_pass_all_test_reward/mean": 0.550000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2777460217475891, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3541666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2062034010887146, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 469.75, "completions/mean_terminated_length": 469.75, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.11732152739346983, "frac_reward_zero_std": 0.0, "grad_norm": 0.59765625, "kl": 0.01812728471122682, "learning_rate": 7.805777504609712e-06, "loss": 0.0007, "num_tokens": 5544999.0, "reward": 2.5625, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 561.5, "completions/mean_terminated_length": 561.5, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.11750599520383694, "frac_reward_zero_std": 0.0, "grad_norm": 0.6953125, "kl": 0.01948946452466771, "learning_rate": 7.818070067609098e-06, "loss": 0.0008, "num_tokens": 5558867.0, "reward": 1.7724359035491943, "reward_std": 0.8082226514816284, "rewards/fixed_code_pass_all_test_reward/mean": 0.23076924681663513, "rewards/fixed_code_pass_all_test_reward/std": 0.14243386685848236, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4714045524597168, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 323.375, "completions/mean_terminated_length": 323.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.11769046301420402, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.008599384571425617, "learning_rate": 7.830362630608482e-06, "loss": 0.0003, "num_tokens": 5565510.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 179.875, "completions/mean_terminated_length": 179.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.11787493082457111, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.03964581526815891, "learning_rate": 7.842655193607868e-06, "loss": 0.0016, "num_tokens": 5575997.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 315.875, "completions/mean_terminated_length": 315.875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.11805939863493821, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.010144032392418012, "learning_rate": 7.854947756607252e-06, "loss": 0.0004, "num_tokens": 5582508.0, "reward": 2.488888740539551, "reward_std": 0.12272622436285019, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4888888895511627, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12272623926401138, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 291.25, "completions/mean_terminated_length": 291.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.1182438664453053, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.010169679735554382, "learning_rate": 7.867240319606638e-06, "loss": 0.0004, "num_tokens": 5587854.0, "reward": 0.9625000357627869, "reward_std": 0.6323143839836121, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21250000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2531938850879669, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.11842833425567238, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.061237980145961046, "learning_rate": 7.879532882606024e-06, "loss": 0.0024, "num_tokens": 5598297.0, "reward": 2.0916666984558105, "reward_std": 0.44676971435546875, "rewards/fixed_code_pass_all_test_reward/mean": 0.6124999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.42573466897010803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4791666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.058925561606884, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 522.25, "completions/mean_terminated_length": 522.25, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.11861280206603948, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.01640582422260195, "learning_rate": 7.891825445605409e-06, "loss": 0.0007, "num_tokens": 5610731.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.11879726987640657, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.031847690464928746, "learning_rate": 7.904118008604795e-06, "loss": 0.0013, "num_tokens": 5616276.0, "reward": 1.1666667461395264, "reward_std": 0.15430331230163574, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.15430335700511932, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 247.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.11898173768677366, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.031633765203878284, "learning_rate": 7.91641057160418e-06, "loss": 0.0013, "num_tokens": 5622392.0, "reward": 2.0416667461395264, "reward_std": 0.11785121262073517, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 135.125, "completions/mean_terminated_length": 135.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.11916620549714074, "frac_reward_zero_std": 1.0, "grad_norm": 0.3046875, "kl": 0.03451478842180222, "learning_rate": 7.928703134603567e-06, "loss": 0.0014, "num_tokens": 5626249.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 285.875, "completions/mean_terminated_length": 285.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.11935067330750784, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.047601852915249765, "learning_rate": 7.940995697602951e-06, "loss": 0.0019, "num_tokens": 5632664.0, "reward": 1.0801630020141602, "reward_std": 0.483837366104126, "rewards/fixed_code_pass_all_test_reward/mean": 0.17391303181648254, "rewards/fixed_code_pass_all_test_reward/std": 0.17391304671764374, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 426.625, "completions/mean_terminated_length": 426.625, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.11953514111787493, "frac_reward_zero_std": 1.0, "grad_norm": 0.04248046875, "kl": 0.013985996250994503, "learning_rate": 7.953288260602337e-06, "loss": 0.0006, "num_tokens": 5641917.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.11971960892824202, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.018848940962925553, "learning_rate": 7.965580823601721e-06, "loss": 0.0008, "num_tokens": 5645813.0, "reward": 2.7249999046325684, "reward_std": 0.5119988918304443, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8500000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3505098223686218, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 274.875, "completions/mean_terminated_length": 274.875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.11990407673860912, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.04889643285423517, "learning_rate": 7.977873386601107e-06, "loss": 0.002, "num_tokens": 5655948.0, "reward": 2.1022727489471436, "reward_std": 1.085960030555725, "rewards/fixed_code_pass_all_test_reward/mean": 0.6022727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.3915805220603943, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 356.125, "completions/mean_terminated_length": 356.125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.1200885445489762, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.024039389682002366, "learning_rate": 7.990165949600493e-06, "loss": 0.001, "num_tokens": 5664101.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.12027301235934329, "frac_reward_zero_std": 1.0, "grad_norm": 0.1806640625, "kl": 0.05526845529675484, "learning_rate": 8.002458512599878e-06, "loss": 0.0022, "num_tokens": 5670174.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 504.5, "completions/mean_terminated_length": 504.5, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.12045748016971039, "frac_reward_zero_std": 0.0, "grad_norm": 0.65234375, "kl": 0.007096717279637232, "learning_rate": 8.014751075599264e-06, "loss": 0.0003, "num_tokens": 5679658.0, "reward": 2.4537696838378906, "reward_std": 0.2225305289030075, "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2070196568965912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.603769838809967, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11941120028495789, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 294.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.12064194798007748, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.037915556342341006, "learning_rate": 8.027043638598648e-06, "loss": 0.0015, "num_tokens": 5686318.0, "reward": 1.2958333492279053, "reward_std": 0.2757026255130768, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.15526476502418518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10833333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15507295727729797, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 139.625, "completions/mean_terminated_length": 139.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.12082641579044456, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.01587042777100578, "learning_rate": 8.039336201598034e-06, "loss": 0.0006, "num_tokens": 5690147.0, "reward": 1.957291603088379, "reward_std": 0.3957972526550293, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08229167014360428, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09024190902709961, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 378.25, "completions/mean_terminated_length": 378.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.12101088360081166, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.024394327774643898, "learning_rate": 8.051628764597418e-06, "loss": 0.001, "num_tokens": 5700845.0, "reward": 2.03125, "reward_std": 0.1473139524459839, "rewards/fixed_code_pass_all_test_reward/mean": 0.3645833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.1473139226436615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 120.625, "completions/mean_terminated_length": 120.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.12119535141117875, "frac_reward_zero_std": 0.0, "grad_norm": 3.125, "kl": 0.057894143741577864, "learning_rate": 8.063921327596804e-06, "loss": 0.0023, "num_tokens": 5707858.0, "reward": 2.125, "reward_std": 0.9910312294960022, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.12137981922154584, "frac_reward_zero_std": 1.0, "grad_norm": 0.1982421875, "kl": 0.05941069801338017, "learning_rate": 8.07621389059619e-06, "loss": 0.0024, "num_tokens": 5713506.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 449.25, "completions/mean_terminated_length": 449.25, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.12156428703191294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.01615696382941678, "learning_rate": 8.088506453595574e-06, "loss": 0.0006, "num_tokens": 5722212.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.12174875484228002, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.02951958228368312, "learning_rate": 8.10079901659496e-06, "loss": 0.0012, "num_tokens": 5729886.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 129.375, "completions/mean_terminated_length": 129.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.12193322265264711, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.06821266794577241, "learning_rate": 8.113091579594346e-06, "loss": 0.0027, "num_tokens": 5735833.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.1221176904630142, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.050211408524774015, "learning_rate": 8.125384142593732e-06, "loss": 0.002, "num_tokens": 5745024.0, "reward": 2.125480890274048, "reward_std": 0.49178773164749146, "rewards/fixed_code_pass_all_test_reward/mean": 0.26923078298568726, "rewards/fixed_code_pass_all_test_reward/std": 0.45228826999664307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.856249988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.282131552696228, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.1223021582733813, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.01844236464239657, "learning_rate": 8.137676705593117e-06, "loss": 0.0007, "num_tokens": 5751539.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.12248662608374838, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.025372046045958996, "learning_rate": 8.149969268592503e-06, "loss": 0.001, "num_tokens": 5761599.0, "reward": 2.5625, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 412.25, "completions/mean_terminated_length": 412.25, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.12267109389411547, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.016272289911285043, "learning_rate": 8.162261831591887e-06, "loss": 0.0007, "num_tokens": 5773961.0, "reward": 1.951612949371338, "reward_std": 0.8475254774093628, "rewards/fixed_code_pass_all_test_reward/mean": 0.2016129195690155, "rewards/fixed_code_pass_all_test_reward/std": 0.32107943296432495, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.12285556170448257, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.0353060441557318, "learning_rate": 8.174554394591273e-06, "loss": 0.0014, "num_tokens": 5780437.0, "reward": 2.5687499046325684, "reward_std": 0.2837443947792053, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6520833373069763, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12261616438627243, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.12304002951484966, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.015522319241426885, "learning_rate": 8.186846957590659e-06, "loss": 0.0006, "num_tokens": 5784593.0, "reward": 1.3312499523162842, "reward_std": 0.7713335156440735, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08124999701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11319231986999512, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 210.75, "completions/mean_terminated_length": 210.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.12322449732521674, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.022825254884082824, "learning_rate": 8.199139520590043e-06, "loss": 0.0009, "num_tokens": 5789687.0, "reward": 2.262946605682373, "reward_std": 0.5478752851486206, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3879464268684387, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25362488627433777, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 211.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.12340896513558385, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.02768389298580587, "learning_rate": 8.21143208358943e-06, "loss": 0.0011, "num_tokens": 5795056.0, "reward": 1.6416666507720947, "reward_std": 0.3544502556324005, "rewards/fixed_code_pass_all_test_reward/mean": 0.5583333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.2958710789680481, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0833333358168602, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15430335700511932, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 109.625, "completions/mean_terminated_length": 109.625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.12359343294595093, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.025298007996752858, "learning_rate": 8.223724646588814e-06, "loss": 0.001, "num_tokens": 5798629.0, "reward": 2.4937500953674316, "reward_std": 0.6077500581741333, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6187499761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25995078682899475, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 491.0, "completions/mean_terminated_length": 491.0, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.12377790075631802, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.012095066660549492, "learning_rate": 8.2360172095882e-06, "loss": 0.0005, "num_tokens": 5808285.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 291.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.12396236856668512, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.007635523827048019, "learning_rate": 8.248309772587586e-06, "loss": 0.0003, "num_tokens": 5814668.0, "reward": 2.183333396911621, "reward_std": 0.3142135441303253, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.3053751289844513, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3916666507720947, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18665817379951477, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 266.75, "completions/mean_terminated_length": 266.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.1241468363770522, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.028377910144627094, "learning_rate": 8.26060233558697e-06, "loss": 0.0011, "num_tokens": 5822442.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 325.25, "completions/mean_terminated_length": 325.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.12433130418741929, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.02430231822654605, "learning_rate": 8.272894898586356e-06, "loss": 0.001, "num_tokens": 5829156.0, "reward": 1.0833332538604736, "reward_std": 0.5136238932609558, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2083333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2815771996974945, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 323.625, "completions/mean_terminated_length": 323.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.12451577199778639, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.018784209969453514, "learning_rate": 8.28518746158574e-06, "loss": 0.0008, "num_tokens": 5836057.0, "reward": 1.7291667461395264, "reward_std": 0.2579910457134247, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.20795641839504242, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1458333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2736801207065582, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 133.625, "completions/mean_terminated_length": 133.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.12470023980815348, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.04263047070708126, "learning_rate": 8.297480024585126e-06, "loss": 0.0017, "num_tokens": 5843702.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.12488470761852057, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.018253799993544817, "learning_rate": 8.309772587584512e-06, "loss": 0.0007, "num_tokens": 5849231.0, "reward": 2.049999952316284, "reward_std": 1.1940327882766724, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.550000011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4750939607620239, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 181.875, "completions/mean_terminated_length": 181.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.12506917542888765, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.05464906292036176, "learning_rate": 8.322065150583898e-06, "loss": 0.0022, "num_tokens": 5856622.0, "reward": 1.59375, "reward_std": 0.7063171863555908, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.71875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.41052016615867615, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 249.25, "completions/mean_terminated_length": 249.25, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.12525364323925475, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.009604634484276175, "learning_rate": 8.334357713583283e-06, "loss": 0.0004, "num_tokens": 5862032.0, "reward": 2.221978187561035, "reward_std": 0.38874927163124084, "rewards/fixed_code_pass_all_test_reward/mean": 0.8250000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3969780206680298, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36677613854408264, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 193.625, "completions/mean_terminated_length": 193.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.12543811104962185, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.05871613975614309, "learning_rate": 8.346650276582669e-06, "loss": 0.0023, "num_tokens": 5869605.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.12562257885998893, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.00676296220626682, "learning_rate": 8.358942839582055e-06, "loss": 0.0003, "num_tokens": 5874545.0, "reward": 2.808333396911621, "reward_std": 0.14001141488552094, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8083333373069763, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14001132547855377, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.12580704667035603, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.0080991773866117, "learning_rate": 8.371235402581439e-06, "loss": 0.0003, "num_tokens": 5879120.0, "reward": 2.675595283508301, "reward_std": 0.38721418380737305, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8005952835083008, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2861950397491455, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 387.625, "completions/mean_terminated_length": 387.625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.12599151448072313, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.018559135496616364, "learning_rate": 8.383527965580825e-06, "loss": 0.0007, "num_tokens": 5891597.0, "reward": 1.2657406330108643, "reward_std": 0.27353549003601074, "rewards/fixed_code_pass_all_test_reward/mean": 0.07407407462596893, "rewards/fixed_code_pass_all_test_reward/std": 0.1252080351114273, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.19166666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22590206563472748, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 190.625, "completions/mean_terminated_length": 190.625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.1261759822910902, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0471159138251096, "learning_rate": 8.39582052858021e-06, "loss": 0.0019, "num_tokens": 5898362.0, "reward": 2.4375, "reward_std": 0.2625803053379059, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2625803053379059, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.1263604501014573, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.018787355162203312, "learning_rate": 8.408113091579595e-06, "loss": 0.0008, "num_tokens": 5906331.0, "reward": 1.9166666269302368, "reward_std": 0.4653041362762451, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3413325846195221, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 302.5, "completions/mean_terminated_length": 302.5, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.12654491791182437, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.0452215934637934, "learning_rate": 8.42040565457898e-06, "loss": 0.0018, "num_tokens": 5913559.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 116.375, "completions/mean_terminated_length": 116.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.12672938572219147, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.05536330514587462, "learning_rate": 8.432698217578365e-06, "loss": 0.0022, "num_tokens": 5919778.0, "reward": 2.0625, "reward_std": 0.4172614812850952, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.12691385353255857, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.006545850163092837, "learning_rate": 8.444990780577751e-06, "loss": 0.0003, "num_tokens": 5924491.0, "reward": 2.1812500953674316, "reward_std": 0.3463457226753235, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.18125000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3463457524776459, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 326.875, "completions/mean_terminated_length": 326.875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.12709832134292565, "frac_reward_zero_std": 0.0, "grad_norm": 0.73828125, "kl": 0.025679271668195724, "learning_rate": 8.457283343577136e-06, "loss": 0.001, "num_tokens": 5936514.0, "reward": 1.982007622718811, "reward_std": 0.17655335366725922, "rewards/fixed_code_pass_all_test_reward/mean": 0.6486742496490479, "rewards/fixed_code_pass_all_test_reward/std": 0.17655335366725922, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 196.875, "completions/mean_terminated_length": 196.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.12728278915329275, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.03542121429927647, "learning_rate": 8.469575906576522e-06, "loss": 0.0014, "num_tokens": 5943313.0, "reward": 2.2083334922790527, "reward_std": 1.006920576095581, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8333333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3563483655452728, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 343.75, "completions/mean_terminated_length": 343.75, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.12746725696365985, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.028911778470501304, "learning_rate": 8.481868469575906e-06, "loss": 0.0012, "num_tokens": 5951447.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 144.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.12765172477402692, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.03413540328620002, "learning_rate": 8.494161032575292e-06, "loss": 0.0014, "num_tokens": 5955428.0, "reward": 1.859375, "reward_std": 0.48438796401023865, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.234375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21083910763263702, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.12783619258439402, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.023477197566535324, "learning_rate": 8.506453595574678e-06, "loss": 0.0009, "num_tokens": 5960122.0, "reward": 2.29300594329834, "reward_std": 0.2084120213985443, "rewards/fixed_code_pass_all_test_reward/mean": 0.953125, "rewards/fixed_code_pass_all_test_reward/std": 0.13258251547813416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.33988094329833984, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12738652527332306, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 126.875, "completions/mean_terminated_length": 126.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.12802066039476112, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.03248547646217048, "learning_rate": 8.518746158574064e-06, "loss": 0.0013, "num_tokens": 5968185.0, "reward": 2.763157844543457, "reward_std": 0.2531948983669281, "rewards/fixed_code_pass_all_test_reward/mean": 0.7631579041481018, "rewards/fixed_code_pass_all_test_reward/std": 0.2531948685646057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.1282051282051282, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.05490748840384185, "learning_rate": 8.531038721573448e-06, "loss": 0.0022, "num_tokens": 5977706.0, "reward": 1.8796361684799194, "reward_std": 0.1320623755455017, "rewards/fixed_code_pass_all_test_reward/mean": 0.6338028311729431, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1320624053478241, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 283.125, "completions/mean_terminated_length": 283.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.1283895960154953, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.021841136913280934, "learning_rate": 8.543331284572834e-06, "loss": 0.0009, "num_tokens": 5984043.0, "reward": 1.0777311325073242, "reward_std": 0.17915129661560059, "rewards/fixed_code_pass_all_test_reward/mean": 0.07773109525442123, "rewards/fixed_code_pass_all_test_reward/std": 0.17915132641792297, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 334.625, "completions/mean_terminated_length": 334.625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.1285740638258624, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.037095611449331045, "learning_rate": 8.55562384757222e-06, "loss": 0.0015, "num_tokens": 5992960.0, "reward": 1.625, "reward_std": 0.11785116046667099, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511381149292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 343.0, "completions/mean_terminated_length": 343.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.12875853163622947, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.008574498700909317, "learning_rate": 8.567916410571605e-06, "loss": 0.0003, "num_tokens": 6001232.0, "reward": 1.3824999332427979, "reward_std": 0.3956097960472107, "rewards/fixed_code_pass_all_test_reward/mean": 0.6324999332427979, "rewards/fixed_code_pass_all_test_reward/std": 0.2791697084903717, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 140.25, "completions/mean_terminated_length": 140.25, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.12894299944659657, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.02859505359083414, "learning_rate": 8.58020897357099e-06, "loss": 0.0011, "num_tokens": 6005122.0, "reward": 1.9249999523162842, "reward_std": 0.3845219612121582, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.12912746725696367, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.04251883493270725, "learning_rate": 8.592501536570375e-06, "loss": 0.0017, "num_tokens": 6013796.0, "reward": 1.875, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.12931193506733074, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.04447661410085857, "learning_rate": 8.604794099569761e-06, "loss": 0.0018, "num_tokens": 6021478.0, "reward": 1.875, "reward_std": 0.9910312294960022, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.12949640287769784, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.02717140130698681, "learning_rate": 8.617086662569147e-06, "loss": 0.0011, "num_tokens": 6028289.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 216.875, "completions/mean_terminated_length": 216.875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.12968087068806494, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.041202254593372345, "learning_rate": 8.629379225568531e-06, "loss": 0.0016, "num_tokens": 6034128.0, "reward": 2.055107593536377, "reward_std": 0.29801151156425476, "rewards/fixed_code_pass_all_test_reward/mean": 0.9301075339317322, "rewards/fixed_code_pass_all_test_reward/std": 0.19768576323986053, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17251639068126678, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 190.625, "completions/mean_terminated_length": 190.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.12986533849843201, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.03991409228183329, "learning_rate": 8.641671788567917e-06, "loss": 0.0016, "num_tokens": 6043293.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 146.25, "completions/mean_terminated_length": 146.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.13004980630879912, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.0556339044123888, "learning_rate": 8.653964351567302e-06, "loss": 0.0022, "num_tokens": 6051023.0, "reward": 2.009999990463257, "reward_std": 0.02138087898492813, "rewards/fixed_code_pass_all_test_reward/mean": 0.009999999776482582, "rewards/fixed_code_pass_all_test_reward/std": 0.021380899474024773, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.13023427411916622, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.033618730143643916, "learning_rate": 8.666256914566688e-06, "loss": 0.0013, "num_tokens": 6058997.0, "reward": 1.8184524774551392, "reward_std": 0.27055519819259644, "rewards/fixed_code_pass_all_test_reward/mean": 0.5267857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.27779197692871094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511381149292, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 241.5, "completions/mean_terminated_length": 241.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.1304187419295333, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.02346839033998549, "learning_rate": 8.678549477566074e-06, "loss": 0.0009, "num_tokens": 6070001.0, "reward": 2.5416667461395264, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.1306032097399004, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.0679163804743439, "learning_rate": 8.690842040565458e-06, "loss": 0.0027, "num_tokens": 6075928.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 561.75, "completions/mean_terminated_length": 561.75, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.1307876775502675, "frac_reward_zero_std": 0.0, "grad_norm": 0.59375, "kl": 0.011073893518187106, "learning_rate": 8.703134603564844e-06, "loss": 0.0004, "num_tokens": 6087278.0, "reward": 1.1458332538604736, "reward_std": 0.058925557881593704, "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 305.0, "completions/mean_terminated_length": 305.0, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.13097214536063456, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.025873622158542275, "learning_rate": 8.71542716656423e-06, "loss": 0.001, "num_tokens": 6094174.0, "reward": 1.7843283414840698, "reward_std": 0.3211923837661743, "rewards/fixed_code_pass_all_test_reward/mean": 0.7593283653259277, "rewards/fixed_code_pass_all_test_reward/std": 0.29490768909454346, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.13115661317100166, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.040709978668019176, "learning_rate": 8.727719729563616e-06, "loss": 0.0016, "num_tokens": 6103104.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.13134108098136876, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.06550338957458735, "learning_rate": 8.740012292563e-06, "loss": 0.0026, "num_tokens": 6112375.0, "reward": 2.8187499046325684, "reward_std": 0.2724721133708954, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8500000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2777460217475891, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.13152554879173584, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.020419040694832802, "learning_rate": 8.752304855562386e-06, "loss": 0.0008, "num_tokens": 6122031.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.13171001660210294, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.01771109999390319, "learning_rate": 8.76459741856177e-06, "loss": 0.0007, "num_tokens": 6126943.0, "reward": 1.7145832777023315, "reward_std": 0.8935144543647766, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21458333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23049937188625336, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 83.5, "completions/mean_terminated_length": 83.5, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.13189448441247004, "frac_reward_zero_std": 1.0, "grad_norm": 0.228515625, "kl": 0.0735640712082386, "learning_rate": 8.776889981561157e-06, "loss": 0.0029, "num_tokens": 6132307.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 298.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.1320789522228371, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.039763202279573306, "learning_rate": 8.78918254456054e-06, "loss": 0.0016, "num_tokens": 6142493.0, "reward": 2.25, "reward_std": 0.6606874465942383, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4520675837993622, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.1322634200332042, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.054406506242230535, "learning_rate": 8.801475107559927e-06, "loss": 0.0022, "num_tokens": 6149153.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 373.125, "completions/mean_terminated_length": 373.125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.1324478878435713, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.017079977493267506, "learning_rate": 8.813767670559313e-06, "loss": 0.0007, "num_tokens": 6157506.0, "reward": 1.8928571939468384, "reward_std": 0.44490382075309753, "rewards/fixed_code_pass_all_test_reward/mean": 0.3928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.12296092510223389, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.38172540068626404, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 505.625, "completions/mean_terminated_length": 505.625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.13263235565393838, "frac_reward_zero_std": 0.0, "grad_norm": 0.86328125, "kl": 0.013132393185514957, "learning_rate": 8.826060233558697e-06, "loss": 0.0005, "num_tokens": 6167783.0, "reward": 1.6666667461395264, "reward_std": 0.17817413806915283, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511381149292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.13281682346430548, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.04755099234171212, "learning_rate": 8.838352796558083e-06, "loss": 0.0019, "num_tokens": 6175879.0, "reward": 2.106060743331909, "reward_std": 0.1265076845884323, "rewards/fixed_code_pass_all_test_reward/mean": 0.14772728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.032141219824552536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 278.625, "completions/mean_terminated_length": 278.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.13300129127467256, "frac_reward_zero_std": 0.0, "grad_norm": 4.09375, "kl": 0.022249098808970302, "learning_rate": 8.850645359557467e-06, "loss": 0.0009, "num_tokens": 6181068.0, "reward": 0.8999999761581421, "reward_std": 0.37032803893089294, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.13318575908503966, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.027961244340986013, "learning_rate": 8.862937922556853e-06, "loss": 0.0011, "num_tokens": 6189942.0, "reward": 2.5546875, "reward_std": 0.36164847016334534, "rewards/fixed_code_pass_all_test_reward/mean": 0.8046875, "rewards/fixed_code_pass_all_test_reward/std": 0.36164847016334534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.13337022689540676, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.06365575431846082, "learning_rate": 8.87523048555624e-06, "loss": 0.0025, "num_tokens": 6197983.0, "reward": 2.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 147.625, "completions/mean_terminated_length": 147.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.13355469470577383, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.01633906620554626, "learning_rate": 8.887523048555624e-06, "loss": 0.0007, "num_tokens": 6201980.0, "reward": 1.9636904001235962, "reward_std": 0.39957815408706665, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08869047462940216, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09655342251062393, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 226.0, "completions/mean_terminated_length": 226.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.13373916251614093, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.010974725300911814, "learning_rate": 8.89981561155501e-06, "loss": 0.0004, "num_tokens": 6207076.0, "reward": 1.9249999523162842, "reward_std": 0.5035586953163147, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.550000011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16903086006641388, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 335.5, "completions/mean_terminated_length": 335.5, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.13392363032650803, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.019973056972958148, "learning_rate": 8.912108174554396e-06, "loss": 0.0008, "num_tokens": 6216504.0, "reward": 2.2291667461395264, "reward_std": 0.28434693813323975, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2291666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.28434693813323975, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 248.75, "completions/mean_terminated_length": 248.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.1341080981368751, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.06251358985900879, "learning_rate": 8.924400737553782e-06, "loss": 0.0025, "num_tokens": 6225694.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.1342925659472422, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.05591600434854627, "learning_rate": 8.936693300553166e-06, "loss": 0.0022, "num_tokens": 6237320.0, "reward": 1.233173131942749, "reward_std": 0.5829753279685974, "rewards/fixed_code_pass_all_test_reward/mean": 0.32692307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.3083783686161041, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.1344770337576093, "frac_reward_zero_std": 1.0, "grad_norm": 0.0546875, "kl": 0.014996653830166906, "learning_rate": 8.948985863552552e-06, "loss": 0.0006, "num_tokens": 6245975.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 196.75, "completions/mean_terminated_length": 196.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.13466150156797638, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.05932371434755623, "learning_rate": 8.961278426551936e-06, "loss": 0.0024, "num_tokens": 6254373.0, "reward": 1.7234649658203125, "reward_std": 0.1786496490240097, "rewards/fixed_code_pass_all_test_reward/mean": 0.656798243522644, "rewards/fixed_code_pass_all_test_reward/std": 0.1434830278158188, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06666667014360428, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12848322093486786, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.13484596937834348, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.019699201453477144, "learning_rate": 8.973570989551322e-06, "loss": 0.0008, "num_tokens": 6258771.0, "reward": 2.6229166984558105, "reward_std": 0.6711123585700989, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7479166388511658, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3342175781726837, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 200.75, "completions/mean_terminated_length": 200.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.13503043718871058, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.07237462978810072, "learning_rate": 8.985863552550708e-06, "loss": 0.0029, "num_tokens": 6267201.0, "reward": 2.3704545497894287, "reward_std": 0.2715064585208893, "rewards/fixed_code_pass_all_test_reward/mean": 0.8371212482452393, "rewards/fixed_code_pass_all_test_reward/std": 0.23912116885185242, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5333333015441895, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.06424161791801453, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 192.125, "completions/mean_terminated_length": 192.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.13521490499907765, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.03167380881495774, "learning_rate": 8.998156115550093e-06, "loss": 0.0013, "num_tokens": 6274610.0, "reward": 2.4375, "reward_std": 0.5629958510398865, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.13539937280944475, "frac_reward_zero_std": 0.0, "grad_norm": 0.7734375, "kl": 0.024463605310302228, "learning_rate": 9.010448678549479e-06, "loss": 0.001, "num_tokens": 6288347.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.13558384061981185, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.07224431168287992, "learning_rate": 9.022741241548863e-06, "loss": 0.0029, "num_tokens": 6294524.0, "reward": 2.25, "reward_std": 0.30860668420791626, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.30860671401023865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 212.25, "completions/mean_terminated_length": 212.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.13576830843017892, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.035537042655050755, "learning_rate": 9.035033804548249e-06, "loss": 0.0014, "num_tokens": 6301766.0, "reward": 2.8074324131011963, "reward_std": 0.26576852798461914, "rewards/fixed_code_pass_all_test_reward/mean": 0.8074324131011963, "rewards/fixed_code_pass_all_test_reward/std": 0.26576849818229675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.13595277624054602, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.060564886312931776, "learning_rate": 9.047326367547635e-06, "loss": 0.0024, "num_tokens": 6309014.0, "reward": 2.5833334922790527, "reward_std": 0.49601584672927856, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 662.0, "completions/mean_terminated_length": 662.0, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.13613724405091313, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.016118745086714625, "learning_rate": 9.05961893054702e-06, "loss": 0.0006, "num_tokens": 6324550.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 344.125, "completions/mean_terminated_length": 344.125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.1363217118612802, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.018830742337740958, "learning_rate": 9.071911493546405e-06, "loss": 0.0008, "num_tokens": 6332455.0, "reward": 1.6796875, "reward_std": 0.3711671531200409, "rewards/fixed_code_pass_all_test_reward/mean": 0.6796875, "rewards/fixed_code_pass_all_test_reward/std": 0.3711671829223633, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 280.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.1365061796716473, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.03258543717674911, "learning_rate": 9.08420405654579e-06, "loss": 0.0013, "num_tokens": 6339071.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 243.875, "completions/mean_terminated_length": 243.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.1366906474820144, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.05367995984852314, "learning_rate": 9.096496619545176e-06, "loss": 0.0021, "num_tokens": 6347150.0, "reward": 1.8452380895614624, "reward_std": 0.287691593170166, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8452380895614624, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.28769156336784363, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 389.25, "completions/mean_terminated_length": 389.25, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.13687511529238147, "frac_reward_zero_std": 0.0, "grad_norm": 0.64453125, "kl": 0.017836741521023214, "learning_rate": 9.108789182544562e-06, "loss": 0.0007, "num_tokens": 6360344.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 187.625, "completions/mean_terminated_length": 187.625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.13705958310274857, "frac_reward_zero_std": 0.0, "grad_norm": 4.5625, "kl": 0.05615188565570861, "learning_rate": 9.121081745543948e-06, "loss": 0.0022, "num_tokens": 6365645.0, "reward": 1.6778769493103027, "reward_std": 1.0434110164642334, "rewards/fixed_code_pass_all_test_reward/mean": 0.3035714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.3095892071723938, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6243055462837219, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.42794719338417053, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 391.375, "completions/mean_terminated_length": 391.375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.13724405091311567, "frac_reward_zero_std": 0.0, "grad_norm": 0.66015625, "kl": 0.034034705138765275, "learning_rate": 9.133374308543332e-06, "loss": 0.0014, "num_tokens": 6374152.0, "reward": 1.4255952835083008, "reward_std": 0.3780582547187805, "rewards/fixed_code_pass_all_test_reward/mean": 0.2589285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.07576145231723785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1666666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35634833574295044, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.13742851872348275, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.014536995731759816, "learning_rate": 9.145666871542718e-06, "loss": 0.0006, "num_tokens": 6378820.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 315.5, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.13761298653384985, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.029458162374794483, "learning_rate": 9.157959434542102e-06, "loss": 0.0012, "num_tokens": 6389864.0, "reward": 2.3958334922790527, "reward_std": 0.3630139231681824, "rewards/fixed_code_pass_all_test_reward/mean": 0.3958333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.3630139231681824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.13779745434421695, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.009407680248841643, "learning_rate": 9.170251997541488e-06, "loss": 0.0004, "num_tokens": 6394777.0, "reward": 2.2249999046325684, "reward_std": 0.459036260843277, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8500000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2070196568965912, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 213.75, "completions/mean_terminated_length": 213.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.13798192215458402, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.010079972445964813, "learning_rate": 9.182544560540874e-06, "loss": 0.0004, "num_tokens": 6400087.0, "reward": 2.8937501907348633, "reward_std": 0.15683819353580475, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8937499523162842, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15683817863464355, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.13816638996495112, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.028733404411468655, "learning_rate": 9.194837123540258e-06, "loss": 0.0011, "num_tokens": 6409113.0, "reward": 2.129166603088379, "reward_std": 0.14274799823760986, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17083333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1482035368680954, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.13835085777531822, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.08367111859843135, "learning_rate": 9.207129686539644e-06, "loss": 0.0033, "num_tokens": 6416201.0, "reward": 1.8333333730697632, "reward_std": 0.35634833574295044, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8333333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3563483655452728, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 553.375, "completions/mean_terminated_length": 553.375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.1385353255856853, "frac_reward_zero_std": 0.0, "grad_norm": 0.65234375, "kl": 0.028366627520881593, "learning_rate": 9.219422249539029e-06, "loss": 0.0011, "num_tokens": 6431052.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 468.375, "completions/mean_terminated_length": 468.375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.1387197933960524, "frac_reward_zero_std": 0.0, "grad_norm": 0.6796875, "kl": 0.019127504900097847, "learning_rate": 9.231714812538415e-06, "loss": 0.0008, "num_tokens": 6444871.0, "reward": 2.579545497894287, "reward_std": 0.3698734641075134, "rewards/fixed_code_pass_all_test_reward/mean": 0.7045454978942871, "rewards/fixed_code_pass_all_test_reward/std": 0.3101966381072998, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.13890426120641947, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.07791966036893427, "learning_rate": 9.2440073755378e-06, "loss": 0.0031, "num_tokens": 6454014.0, "reward": 1.21875, "reward_std": 0.1404181867837906, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2187500149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1404181718826294, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.13908872901678657, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.06489795469678938, "learning_rate": 9.256299938537185e-06, "loss": 0.0026, "num_tokens": 6461312.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 405.25, "completions/mean_terminated_length": 405.25, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.13927319682715367, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.019394075556192547, "learning_rate": 9.268592501536571e-06, "loss": 0.0008, "num_tokens": 6469570.0, "reward": 1.390625, "reward_std": 0.3761144280433655, "rewards/fixed_code_pass_all_test_reward/mean": 0.390625, "rewards/fixed_code_pass_all_test_reward/std": 0.3761144280433655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 202.5, "completions/mean_terminated_length": 202.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.13945766463752074, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.013150964688975364, "learning_rate": 9.280885064535955e-06, "loss": 0.0005, "num_tokens": 6474102.0, "reward": 1.4803571701049805, "reward_std": 0.583641767501831, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10535714775323868, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1422307789325714, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.13964213244788784, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.02315110620111227, "learning_rate": 9.293177627535341e-06, "loss": 0.0009, "num_tokens": 6478114.0, "reward": 2.410416603088379, "reward_std": 0.12877242267131805, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4104166626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12877243757247925, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 216.0, "completions/mean_terminated_length": 216.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.13982660025825494, "frac_reward_zero_std": 1.0, "grad_norm": 0.18359375, "kl": 0.05927673727273941, "learning_rate": 9.305470190534727e-06, "loss": 0.0024, "num_tokens": 6487538.0, "reward": 2.3974359035491943, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7307692170143127, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 266.375, "completions/mean_terminated_length": 266.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.140011068068622, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.029159063822589815, "learning_rate": 9.317762753534113e-06, "loss": 0.0012, "num_tokens": 6493605.0, "reward": 1.4469339847564697, "reward_std": 0.24183440208435059, "rewards/fixed_code_pass_all_test_reward/mean": 0.4469339847564697, "rewards/fixed_code_pass_all_test_reward/std": 0.24183443188667297, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 113.0, "completions/mean_terminated_length": 113.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.1401955358789891, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.032811262528412044, "learning_rate": 9.330055316533498e-06, "loss": 0.0013, "num_tokens": 6497325.0, "reward": 1.993749976158142, "reward_std": 0.40570706129074097, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24375000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10500850528478622, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 406.625, "completions/mean_terminated_length": 406.625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.14038000368935621, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.007911232591141015, "learning_rate": 9.342347879532884e-06, "loss": 0.0003, "num_tokens": 6505162.0, "reward": 1.98630952835083, "reward_std": 0.6287128329277039, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.07715168595314026, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.27797621488571167, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21279051899909973, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 407.0, "completions/mean_terminated_length": 407.0, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.1405644714997233, "frac_reward_zero_std": 0.0, "grad_norm": 0.5859375, "kl": 0.027444443898275495, "learning_rate": 9.35464044253227e-06, "loss": 0.0011, "num_tokens": 6517426.0, "reward": 2.0416667461395264, "reward_std": 0.4520675241947174, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022762298584, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 267.375, "completions/mean_terminated_length": 267.375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.1407489393100904, "frac_reward_zero_std": 1.0, "grad_norm": 0.2021484375, "kl": 0.035165514796972275, "learning_rate": 9.366933005531654e-06, "loss": 0.0014, "num_tokens": 6527509.0, "reward": 2.090909004211426, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.09090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.1409334071204575, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.019393951748497784, "learning_rate": 9.37922556853104e-06, "loss": 0.0008, "num_tokens": 6531848.0, "reward": 1.943750023841858, "reward_std": 0.5931445360183716, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4437499940395355, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1743326336145401, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 325.125, "completions/mean_terminated_length": 325.125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.14111787493082456, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.01932902174303308, "learning_rate": 9.391518131530424e-06, "loss": 0.0008, "num_tokens": 6542185.0, "reward": 1.5208333730697632, "reward_std": 0.350028395652771, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3958333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12400396913290024, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 438.875, "completions/mean_terminated_length": 438.875, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.14130234274119166, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.014276023372076452, "learning_rate": 9.40381069452981e-06, "loss": 0.0006, "num_tokens": 6551416.0, "reward": 1.4444444179534912, "reward_std": 0.6849348545074463, "rewards/fixed_code_pass_all_test_reward/mean": 0.569444477558136, "rewards/fixed_code_pass_all_test_reward/std": 0.4617179334163666, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.14148681055155876, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.026350645464845, "learning_rate": 9.416103257529196e-06, "loss": 0.0011, "num_tokens": 6560458.0, "reward": 2.9583332538604736, "reward_std": 0.07715174555778503, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333134651184, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07715168595314026, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.14167127836192583, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.044873229577206075, "learning_rate": 9.42839582052858e-06, "loss": 0.0018, "num_tokens": 6567703.0, "reward": 0.9895833730697632, "reward_std": 0.46384674310684204, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1145833358168602, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23961569368839264, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 101.5, "completions/mean_terminated_length": 101.5, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.14185574617229293, "frac_reward_zero_std": 1.0, "grad_norm": 0.2001953125, "kl": 0.06211558822542429, "learning_rate": 9.440688383527967e-06, "loss": 0.0025, "num_tokens": 6573267.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.14204021398266004, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.007828130415873602, "learning_rate": 9.452980946527351e-06, "loss": 0.0003, "num_tokens": 6579036.0, "reward": 2.4937500953674316, "reward_std": 0.3649241030216217, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4937500059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3649241626262665, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 508.75, "completions/mean_terminated_length": 508.75, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.1422246817930271, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.015954752161633223, "learning_rate": 9.465273509526737e-06, "loss": 0.0006, "num_tokens": 6589170.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 304.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.1424091496033942, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.04117575439158827, "learning_rate": 9.477566072526121e-06, "loss": 0.0016, "num_tokens": 6596364.0, "reward": 1.2916667461395264, "reward_std": 0.4154745042324066, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.1425936174137613, "frac_reward_zero_std": 0.0, "grad_norm": 0.474609375, "kl": 0.0229418691014871, "learning_rate": 9.489858635525507e-06, "loss": 0.0009, "num_tokens": 6601449.0, "reward": 2.9791665077209473, "reward_std": 0.058925628662109375, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9791666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.14277808522412838, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.06422516354359686, "learning_rate": 9.502151198524893e-06, "loss": 0.0026, "num_tokens": 6605377.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.14296255303449548, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.03226367675233632, "learning_rate": 9.51444376152428e-06, "loss": 0.0013, "num_tokens": 6613930.0, "reward": 2.1430554389953613, "reward_std": 0.8796833157539368, "rewards/fixed_code_pass_all_test_reward/mean": 0.8930555582046509, "rewards/fixed_code_pass_all_test_reward/std": 0.1099843680858612, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4898979663848877, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.14314702084486258, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.1295638761948794, "learning_rate": 9.526736324523664e-06, "loss": 0.0052, "num_tokens": 6621371.0, "reward": 2.0416667461395264, "reward_std": 0.4520675241947174, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022762298584, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.14333148865522966, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.06530210515484214, "learning_rate": 9.53902888752305e-06, "loss": 0.0026, "num_tokens": 6629215.0, "reward": 2.2083334922790527, "reward_std": 0.4886273145675659, "rewards/fixed_code_pass_all_test_reward/mean": 0.4583333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.4217938482761383, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.28171810507774353, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.14351595646559676, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.033528224332258105, "learning_rate": 9.551321450522436e-06, "loss": 0.0013, "num_tokens": 6638270.0, "reward": 1.6691092252731323, "reward_std": 0.2753939926624298, "rewards/fixed_code_pass_all_test_reward/mean": 0.6566091775894165, "rewards/fixed_code_pass_all_test_reward/std": 0.26649656891822815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.012500000186264515, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0353553406894207, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.14370042427596386, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.027097996091470122, "learning_rate": 9.56361401352182e-06, "loss": 0.0011, "num_tokens": 6645665.0, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.14388489208633093, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.020307356491684914, "learning_rate": 9.575906576521206e-06, "loss": 0.0008, "num_tokens": 6651187.0, "reward": 2.5291666984558105, "reward_std": 0.4964756667613983, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7791666984558105, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25130611658096313, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 143.25, "completions/mean_terminated_length": 143.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.14406935989669803, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.07881730003282428, "learning_rate": 9.58819913952059e-06, "loss": 0.0032, "num_tokens": 6659021.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 166.125, "completions/mean_terminated_length": 166.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.14425382770706513, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.058801956940442324, "learning_rate": 9.600491702519976e-06, "loss": 0.0024, "num_tokens": 6666126.0, "reward": 1.875, "reward_std": 0.9910312294960022, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 157.375, "completions/mean_terminated_length": 157.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.1444382955174322, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.0623785313218832, "learning_rate": 9.612784265519362e-06, "loss": 0.0025, "num_tokens": 6673345.0, "reward": 1.8250000476837158, "reward_std": 0.2592724859714508, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8250000476837158, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2592724859714508, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 138.25, "completions/mean_terminated_length": 138.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.1446227633277993, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.055598010309040546, "learning_rate": 9.625076828518746e-06, "loss": 0.0022, "num_tokens": 6680179.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 188.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1448072311381664, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.04405764816328883, "learning_rate": 9.637369391518132e-06, "loss": 0.0018, "num_tokens": 6689762.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.14499169894853348, "frac_reward_zero_std": 1.0, "grad_norm": 0.302734375, "kl": 0.07903205137699842, "learning_rate": 9.649661954517517e-06, "loss": 0.0032, "num_tokens": 6696470.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.14517616675890058, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.03640018787700683, "learning_rate": 9.661954517516903e-06, "loss": 0.0015, "num_tokens": 6702521.0, "reward": 1.8482142686843872, "reward_std": 0.609898030757904, "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.5050762891769409, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.27885109186172485, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 106.5, "completions/mean_terminated_length": 106.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.14536063456926765, "frac_reward_zero_std": 1.0, "grad_norm": 0.71875, "kl": 0.06496857525780797, "learning_rate": 9.674247080516289e-06, "loss": 0.0026, "num_tokens": 6706237.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 385.625, "completions/mean_terminated_length": 385.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.14554510237963475, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.03401583875529468, "learning_rate": 9.686539643515673e-06, "loss": 0.0014, "num_tokens": 6715026.0, "reward": 0.9166666865348816, "reward_std": 0.38832157850265503, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.14572957019000185, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.009940272488165647, "learning_rate": 9.698832206515059e-06, "loss": 0.0004, "num_tokens": 6721594.0, "reward": 1.0729167461395264, "reward_std": 0.1368400603532791, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 128.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.14591403800036892, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.03730218921555206, "learning_rate": 9.711124769514445e-06, "loss": 0.0015, "num_tokens": 6725490.0, "reward": 2.0020833015441895, "reward_std": 0.6043780446052551, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25208333134651184, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25314491987228394, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.14609850581073602, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.041704942006617785, "learning_rate": 9.723417332513831e-06, "loss": 0.0017, "num_tokens": 6734200.0, "reward": 1.9166667461395264, "reward_std": 1.0186748504638672, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5416666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.40335893630981445, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 343.5, "completions/mean_terminated_length": 343.5, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.14628297362110312, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.009101195493713021, "learning_rate": 9.735709895513215e-06, "loss": 0.0004, "num_tokens": 6741756.0, "reward": 1.7958333492279053, "reward_std": 0.5966673493385315, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4208333492279053, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19835634529590607, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.1464674414314702, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.018873321125283837, "learning_rate": 9.748002458512601e-06, "loss": 0.0008, "num_tokens": 6747699.0, "reward": 2.1708333492279053, "reward_std": 0.5478493571281433, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2708333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39778652787208557, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 124.0, "completions/mean_terminated_length": 124.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.1466519092418373, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.031246848520822823, "learning_rate": 9.760295021511986e-06, "loss": 0.0012, "num_tokens": 6751667.0, "reward": 2.4375, "reward_std": 0.3925647437572479, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3925648629665375, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.1468363770522044, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.031488516135141253, "learning_rate": 9.772587584511372e-06, "loss": 0.0013, "num_tokens": 6757458.0, "reward": 1.8666666746139526, "reward_std": 0.23367321491241455, "rewards/fixed_code_pass_all_test_reward/mean": 0.8666666746139526, "rewards/fixed_code_pass_all_test_reward/std": 0.23367321491241455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 159.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.14702084486257147, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.03968311450444162, "learning_rate": 9.784880147510758e-06, "loss": 0.0016, "num_tokens": 6764285.0, "reward": 2.4708333015441895, "reward_std": 0.16201846301555634, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.47083336114883423, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1620185226202011, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.14720531267293857, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.0418827555840835, "learning_rate": 9.797172710510142e-06, "loss": 0.0017, "num_tokens": 6770739.0, "reward": 1.6022727489471436, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.14738978048330567, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.012552319909445941, "learning_rate": 9.809465273509528e-06, "loss": 0.0005, "num_tokens": 6774953.0, "reward": 1.9500000476837158, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20000000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 129.25, "completions/mean_terminated_length": 129.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.14757424829367274, "frac_reward_zero_std": 1.0, "grad_norm": 0.2177734375, "kl": 0.06563123175874352, "learning_rate": 9.821757836508912e-06, "loss": 0.0026, "num_tokens": 6781651.0, "reward": 1.6363636255264282, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 157.375, "completions/mean_terminated_length": 157.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.14775871610403984, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.015610270871547982, "learning_rate": 9.834050399508298e-06, "loss": 0.0006, "num_tokens": 6785766.0, "reward": 1.537500023841858, "reward_std": 0.48014140129089355, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2874999940395355, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09543135017156601, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 252.875, "completions/mean_terminated_length": 252.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.14794318391440694, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.052320127142593265, "learning_rate": 9.846342962507683e-06, "loss": 0.0021, "num_tokens": 6795837.0, "reward": 1.649999976158142, "reward_std": 0.4869731366634369, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6499999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4869731664657593, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.14812765172477402, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.01706112240208313, "learning_rate": 9.858635525507069e-06, "loss": 0.0007, "num_tokens": 6801744.0, "reward": 1.884615421295166, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8846153616905212, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 300.375, "completions/mean_terminated_length": 300.375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.14831211953514112, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.022386777796782553, "learning_rate": 9.870928088506455e-06, "loss": 0.0009, "num_tokens": 6808675.0, "reward": 1.575099229812622, "reward_std": 0.49090439081192017, "rewards/fixed_code_pass_all_test_reward/mean": 0.4563491940498352, "rewards/fixed_code_pass_all_test_reward/std": 0.051434461027383804, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24375000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24863716959953308, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 163.75, "completions/mean_terminated_length": 163.75, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.14849658734550822, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.09685088112019002, "learning_rate": 9.883220651505839e-06, "loss": 0.0039, "num_tokens": 6815841.0, "reward": 1.6410984992980957, "reward_std": 0.287739098072052, "rewards/fixed_code_pass_all_test_reward/mean": 0.0181818176060915, "rewards/fixed_code_pass_all_test_reward/std": 0.030732883140444756, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6229166984558105, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2718159854412079, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.1486810551558753, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.021541473106481135, "learning_rate": 9.895513214505225e-06, "loss": 0.0009, "num_tokens": 6819844.0, "reward": 1.4187500476837158, "reward_std": 0.3622501790523529, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.41874998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3622501790523529, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 140.5, "completions/mean_terminated_length": 140.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.1488655229662424, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.09903133846819401, "learning_rate": 9.907805777504611e-06, "loss": 0.004, "num_tokens": 6826072.0, "reward": 2.195833206176758, "reward_std": 0.7316946387290955, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5708333253860474, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.42217785120010376, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.1490499907766095, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.0447469181381166, "learning_rate": 9.920098340503997e-06, "loss": 0.0018, "num_tokens": 6834406.0, "reward": 1.3229167461395264, "reward_std": 0.43742913007736206, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 188.75, "completions/mean_terminated_length": 188.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.14923445858697656, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.05388557864353061, "learning_rate": 9.932390903503381e-06, "loss": 0.0022, "num_tokens": 6842436.0, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.78125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.41052016615867615, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.14941892639734367, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.1258281311020255, "learning_rate": 9.944683466502767e-06, "loss": 0.005, "num_tokens": 6848612.0, "reward": 2.5208334922790527, "reward_std": 0.6751689910888672, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7708333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3666396141052246, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.14960339420771077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0162353515625, "kl": 0.0049185199168277904, "learning_rate": 9.956976029502152e-06, "loss": 0.0002, "num_tokens": 6855912.0, "reward": 1.9090909957885742, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.14978786201807784, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.017792601167457178, "learning_rate": 9.969268592501538e-06, "loss": 0.0007, "num_tokens": 6860747.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.14997232982844494, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.058546374551951885, "learning_rate": 9.981561155500924e-06, "loss": 0.0023, "num_tokens": 6868473.0, "reward": 1.75, "reward_std": 0.3450327515602112, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.34503278136253357, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 329.625, "completions/mean_terminated_length": 329.625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.15015679763881204, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.019116566400043666, "learning_rate": 9.993853718500308e-06, "loss": 0.0008, "num_tokens": 6875542.0, "reward": 1.6608695983886719, "reward_std": 0.09671908617019653, "rewards/fixed_code_pass_all_test_reward/mean": 0.6358695030212402, "rewards/fixed_code_pass_all_test_reward/std": 0.07685943692922592, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 276.125, "completions/mean_terminated_length": 276.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.1503412654491791, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.015339896141085774, "learning_rate": 1.0006146281499694e-05, "loss": 0.0006, "num_tokens": 6881751.0, "reward": 2.2857143878936768, "reward_std": 0.6290403008460999, "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022762298584, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 324.0, "completions/mean_terminated_length": 324.0, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.1505257332595462, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.020057305926457047, "learning_rate": 1.0018438844499078e-05, "loss": 0.0008, "num_tokens": 6889119.0, "reward": 1.513157844543457, "reward_std": 0.5208361148834229, "rewards/fixed_code_pass_all_test_reward/mean": 0.5131579041481018, "rewards/fixed_code_pass_all_test_reward/std": 0.5208361148834229, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 157.875, "completions/mean_terminated_length": 157.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.1507102010699133, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.031706904410384595, "learning_rate": 1.0030731407498464e-05, "loss": 0.0013, "num_tokens": 6897390.0, "reward": 1.7408536672592163, "reward_std": 0.27807241678237915, "rewards/fixed_code_pass_all_test_reward/mean": 0.7408536672592163, "rewards/fixed_code_pass_all_test_reward/std": 0.2780724763870239, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 189.625, "completions/mean_terminated_length": 189.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.15089466888028039, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.021776822861284018, "learning_rate": 1.004302397049785e-05, "loss": 0.0009, "num_tokens": 6902595.0, "reward": 2.5999999046325684, "reward_std": 0.3023715913295746, "rewards/fixed_code_pass_all_test_reward/mean": 0.9249999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.1035098284482956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.675000011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2712405323982239, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 334.375, "completions/mean_terminated_length": 334.375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.1510791366906475, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.01570560608524829, "learning_rate": 1.0055316533497234e-05, "loss": 0.0006, "num_tokens": 6910014.0, "reward": 0.929411768913269, "reward_std": 0.3883116841316223, "rewards/fixed_code_pass_all_test_reward/mean": 0.029411764815449715, "rewards/fixed_code_pass_all_test_reward/std": 0.08318904042243958, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 222.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.15126360450101456, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.02714967983774841, "learning_rate": 1.006760909649662e-05, "loss": 0.0011, "num_tokens": 6917842.0, "reward": 2.0455358028411865, "reward_std": 0.11319231986999512, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08124999701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11319231986999512, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.15144807231138166, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.042090835981070995, "learning_rate": 1.0079901659496005e-05, "loss": 0.0017, "num_tokens": 6925088.0, "reward": 2.5833334922790527, "reward_std": 0.7292091846466064, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8333333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35634833574295044, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 530.875, "completions/mean_terminated_length": 530.875, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.15163254012174876, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.00559491608873941, "learning_rate": 1.009219422249539e-05, "loss": 0.0002, "num_tokens": 6935711.0, "reward": 1.5729167461395264, "reward_std": 0.47442010045051575, "rewards/fixed_code_pass_all_test_reward/mean": 0.5729166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.47442010045051575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 114.125, "completions/mean_terminated_length": 114.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.15181700793211583, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.06206970755010843, "learning_rate": 1.0104486785494777e-05, "loss": 0.0025, "num_tokens": 6942008.0, "reward": 1.7592592239379883, "reward_std": 0.24084246158599854, "rewards/fixed_code_pass_all_test_reward/mean": 0.7592592239379883, "rewards/fixed_code_pass_all_test_reward/std": 0.24084247648715973, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 350.375, "completions/mean_terminated_length": 350.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.15200147574248293, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.029375761398114264, "learning_rate": 1.0116779348494161e-05, "loss": 0.0012, "num_tokens": 6952507.0, "reward": 2.3958334922790527, "reward_std": 0.5716691017150879, "rewards/fixed_code_pass_all_test_reward/mean": 0.6499999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.4869731664657593, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7458333373069763, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23430918157100677, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.15218594355285003, "frac_reward_zero_std": 1.0, "grad_norm": 0.12109375, "kl": 0.030395310604944825, "learning_rate": 1.0129071911493547e-05, "loss": 0.0012, "num_tokens": 6958010.0, "reward": 1.0860215425491333, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.08602150529623032, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.1523704113632171, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.032178052701056004, "learning_rate": 1.0141364474492931e-05, "loss": 0.0013, "num_tokens": 6969515.0, "reward": 2.032440662384033, "reward_std": 0.6982530951499939, "rewards/fixed_code_pass_all_test_reward/mean": 0.5178571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5145833492279053, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.415134072303772, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 401.0, "completions/mean_terminated_length": 401.0, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.1525548791735842, "frac_reward_zero_std": 0.0, "grad_norm": 0.62890625, "kl": 0.01991586748044938, "learning_rate": 1.0153657037492317e-05, "loss": 0.0008, "num_tokens": 6982011.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.1527393469839513, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.028911867761053145, "learning_rate": 1.0165949600491702e-05, "loss": 0.0012, "num_tokens": 6986721.0, "reward": 2.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 177.5, "completions/mean_terminated_length": 177.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.15292381479431838, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.08414319856092334, "learning_rate": 1.0178242163491088e-05, "loss": 0.0034, "num_tokens": 6995677.0, "reward": 2.1875, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.15310828260468548, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.05211417842656374, "learning_rate": 1.0190534726490474e-05, "loss": 0.0021, "num_tokens": 7003362.0, "reward": 1.75, "reward_std": 0.3450327515602112, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.34503278136253357, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.15329275041505258, "frac_reward_zero_std": 1.0, "grad_norm": 0.2119140625, "kl": 0.034726565005257726, "learning_rate": 1.0202827289489861e-05, "loss": 0.0014, "num_tokens": 7012162.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 102.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.15347721822541965, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.07548800902441144, "learning_rate": 1.0215119852489246e-05, "loss": 0.003, "num_tokens": 7018682.0, "reward": 2.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 140.25, "completions/mean_terminated_length": 140.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.15366168603578675, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.0939127376768738, "learning_rate": 1.0227412415488632e-05, "loss": 0.0038, "num_tokens": 7026628.0, "reward": 2.90625, "reward_std": 0.18600594997406006, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18600596487522125, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 467.0, "completions/mean_terminated_length": 467.0, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.15384615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.02057651965878904, "learning_rate": 1.0239704978488016e-05, "loss": 0.0008, "num_tokens": 7041180.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 127.125, "completions/mean_terminated_length": 127.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.15403062165652093, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.051656093215569854, "learning_rate": 1.0251997541487402e-05, "loss": 0.0021, "num_tokens": 7045021.0, "reward": 1.2916667461395264, "reward_std": 0.4520675837993622, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 199.125, "completions/mean_terminated_length": 199.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.15421508946688803, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.021067991387099028, "learning_rate": 1.0264290104486786e-05, "loss": 0.0008, "num_tokens": 7054334.0, "reward": 1.6504629850387573, "reward_std": 0.058925606310367584, "rewards/fixed_code_pass_all_test_reward/mean": 0.29629629850387573, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3541666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.058925557881593704, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 198.75, "completions/mean_terminated_length": 198.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.15439955727725513, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.03025681700091809, "learning_rate": 1.0276582667486172e-05, "loss": 0.0012, "num_tokens": 7077092.0, "reward": 1.3358433246612549, "reward_std": 0.40860897302627563, "rewards/fixed_code_pass_all_test_reward/mean": 0.33584338426589966, "rewards/fixed_code_pass_all_test_reward/std": 0.40860897302627563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 73.875, "completions/mean_terminated_length": 73.875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.1545840250876222, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.051462715957313776, "learning_rate": 1.0288875230485558e-05, "loss": 0.0021, "num_tokens": 7080563.0, "reward": 2.6875, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1547684928979893, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.022763887303881347, "learning_rate": 1.0301167793484943e-05, "loss": 0.0009, "num_tokens": 7087537.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 149.25, "completions/mean_terminated_length": 149.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.1549529607083564, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.05615383991971612, "learning_rate": 1.0313460356484329e-05, "loss": 0.0022, "num_tokens": 7096523.0, "reward": 2.240530252456665, "reward_std": 0.12065824866294861, "rewards/fixed_code_pass_all_test_reward/mean": 0.8863636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.09559360146522522, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3541666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.058925557881593704, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 211.625, "completions/mean_terminated_length": 211.625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.15513742851872347, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.029096035286784172, "learning_rate": 1.0325752919483713e-05, "loss": 0.0012, "num_tokens": 7102128.0, "reward": 2.4479167461395264, "reward_std": 0.9686940312385559, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4714045524597168, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.15532189632909058, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.09433964965865016, "learning_rate": 1.0338045482483099e-05, "loss": 0.0038, "num_tokens": 7108360.0, "reward": 2.375, "reward_std": 0.2635231912136078, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3421454131603241, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 140.75, "completions/mean_terminated_length": 140.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.15550636413945768, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.056759399361908436, "learning_rate": 1.0350338045482485e-05, "loss": 0.0023, "num_tokens": 7115302.0, "reward": 2.3125, "reward_std": 0.7039429545402527, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4955156147480011, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 109.75, "completions/mean_terminated_length": 109.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.15569083194982475, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.06178180454298854, "learning_rate": 1.036263060848187e-05, "loss": 0.0025, "num_tokens": 7122524.0, "reward": 2.4193549156188965, "reward_std": 0.4925489127635956, "rewards/fixed_code_pass_all_test_reward/mean": 0.4193548262119293, "rewards/fixed_code_pass_all_test_reward/std": 0.4925489127635956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 92.875, "completions/mean_terminated_length": 92.875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.15587529976019185, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.05930089112371206, "learning_rate": 1.0374923171481255e-05, "loss": 0.0024, "num_tokens": 7126163.0, "reward": 1.1666667461395264, "reward_std": 0.3563483655452728, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1666666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35634833574295044, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 165.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.15605976757055895, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.03435019263997674, "learning_rate": 1.038721573448064e-05, "loss": 0.0014, "num_tokens": 7130765.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 117.625, "completions/mean_terminated_length": 117.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.15624423538092602, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0490859467536211, "learning_rate": 1.0399508297480025e-05, "loss": 0.002, "num_tokens": 7138570.0, "reward": 2.463235378265381, "reward_std": 0.4495700001716614, "rewards/fixed_code_pass_all_test_reward/mean": 0.4632353186607361, "rewards/fixed_code_pass_all_test_reward/std": 0.44957005977630615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 143.25, "completions/mean_terminated_length": 143.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.15642870319129312, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.057721138931810856, "learning_rate": 1.0411800860479411e-05, "loss": 0.0023, "num_tokens": 7147684.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 151.625, "completions/mean_terminated_length": 151.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.15661317100166022, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.04315559286624193, "learning_rate": 1.0424093423478796e-05, "loss": 0.0017, "num_tokens": 7152977.0, "reward": 1.0446429252624512, "reward_std": 0.12626908719539642, "rewards/fixed_code_pass_all_test_reward/mean": 0.0446428582072258, "rewards/fixed_code_pass_all_test_reward/std": 0.12626907229423523, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 154.375, "completions/mean_terminated_length": 154.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1567976388120273, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.04804028617218137, "learning_rate": 1.0436385986478182e-05, "loss": 0.0019, "num_tokens": 7163380.0, "reward": 2.2386364936828613, "reward_std": 0.15885880589485168, "rewards/fixed_code_pass_all_test_reward/mean": 0.23863637447357178, "rewards/fixed_code_pass_all_test_reward/std": 0.15885883569717407, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 246.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.1569821066223944, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.022416856372728944, "learning_rate": 1.0448678549477566e-05, "loss": 0.0009, "num_tokens": 7170052.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.1571665744327615, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.055279282154515386, "learning_rate": 1.0460971112476952e-05, "loss": 0.0022, "num_tokens": 7175646.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.15735104224312857, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.04112656507641077, "learning_rate": 1.0473263675476338e-05, "loss": 0.0016, "num_tokens": 7186079.0, "reward": 2.65625, "reward_std": 0.48065245151519775, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.15753551005349567, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.06199896661564708, "learning_rate": 1.0485556238475722e-05, "loss": 0.0025, "num_tokens": 7193760.0, "reward": 2.1666667461395264, "reward_std": 0.5634361505508423, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022762298584, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 134.5, "completions/mean_terminated_length": 134.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.15771997786386274, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.04310403345152736, "learning_rate": 1.0497848801475108e-05, "loss": 0.0017, "num_tokens": 7201804.0, "reward": 1.557926893234253, "reward_std": 0.18837234377861023, "rewards/fixed_code_pass_all_test_reward/mean": 0.30792683362960815, "rewards/fixed_code_pass_all_test_reward/std": 0.08030018210411072, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15430335700511932, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.15790444567422984, "frac_reward_zero_std": 0.0, "grad_norm": 3.328125, "kl": 0.0648380839265883, "learning_rate": 1.0510141364474493e-05, "loss": 0.0026, "num_tokens": 7209325.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.15808891348459694, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.04895312897861004, "learning_rate": 1.0522433927473879e-05, "loss": 0.002, "num_tokens": 7214543.0, "reward": 1.2049319744110107, "reward_std": 0.6401836276054382, "rewards/fixed_code_pass_all_test_reward/mean": 0.16326530277729034, "rewards/fixed_code_pass_all_test_reward/std": 0.35465607047080994, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1666666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35634833574295044, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.15827338129496402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.024122594390064478, "learning_rate": 1.0534726490473263e-05, "loss": 0.001, "num_tokens": 7223728.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.15845784910533112, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.09691812051460147, "learning_rate": 1.0547019053472649e-05, "loss": 0.0039, "num_tokens": 7231456.0, "reward": 2.4124999046325684, "reward_std": 0.4290770888328552, "rewards/fixed_code_pass_all_test_reward/mean": 0.4750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.4773438572883606, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 385.375, "completions/mean_terminated_length": 385.375, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.15864231691569822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.019538147724233568, "learning_rate": 1.0559311616472035e-05, "loss": 0.0008, "num_tokens": 7240283.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.1588267847260653, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.0525142434053123, "learning_rate": 1.057160417947142e-05, "loss": 0.0021, "num_tokens": 7248399.0, "reward": 2.6354167461395264, "reward_std": 0.4541202187538147, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8854166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23961567878723145, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.1590112525364324, "frac_reward_zero_std": 1.0, "grad_norm": 0.03369140625, "kl": 0.03736470965668559, "learning_rate": 1.0583896742470805e-05, "loss": 0.0015, "num_tokens": 7256250.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.1591957203467995, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.01206398569047451, "learning_rate": 1.059618930547019e-05, "loss": 0.0005, "num_tokens": 7261453.0, "reward": 1.953125, "reward_std": 0.09300297498703003, "rewards/fixed_code_pass_all_test_reward/mean": 0.953125, "rewards/fixed_code_pass_all_test_reward/std": 0.09300298243761063, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.15938018815716656, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.05470752762630582, "learning_rate": 1.0608481868469577e-05, "loss": 0.0022, "num_tokens": 7270708.0, "reward": 2.3831753730773926, "reward_std": 0.21314705908298492, "rewards/fixed_code_pass_all_test_reward/mean": 0.7373417615890503, "rewards/fixed_code_pass_all_test_reward/std": 0.23338167369365692, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6458333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.15956465596753366, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.03504409687593579, "learning_rate": 1.0620774431468963e-05, "loss": 0.0014, "num_tokens": 7279531.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 135.25, "completions/mean_terminated_length": 135.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.15974912377790076, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.14226820971816778, "learning_rate": 1.0633066994468348e-05, "loss": 0.0057, "num_tokens": 7286597.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022762298584, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 324.75, "completions/mean_terminated_length": 324.75, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.15993359158826784, "frac_reward_zero_std": 0.0, "grad_norm": 0.7109375, "kl": 0.01613944466225803, "learning_rate": 1.0645359557467734e-05, "loss": 0.0006, "num_tokens": 7296547.0, "reward": 2.3684210777282715, "reward_std": 0.38981905579566956, "rewards/fixed_code_pass_all_test_reward/mean": 0.3684210479259491, "rewards/fixed_code_pass_all_test_reward/std": 0.3898189961910248, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 222.125, "completions/mean_terminated_length": 222.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.16011805939863494, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.019958195625804365, "learning_rate": 1.065765212046712e-05, "loss": 0.0008, "num_tokens": 7301964.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.16030252720900204, "frac_reward_zero_std": 1.0, "grad_norm": 0.3125, "kl": 0.06158966664224863, "learning_rate": 1.0669944683466504e-05, "loss": 0.0025, "num_tokens": 7308752.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 231.5, "completions/mean_terminated_length": 231.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.1604869950193691, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.03502204339019954, "learning_rate": 1.068223724646589e-05, "loss": 0.0014, "num_tokens": 7314876.0, "reward": 2.3090710639953613, "reward_std": 0.38082650303840637, "rewards/fixed_code_pass_all_test_reward/mean": 0.9474637508392334, "rewards/fixed_code_pass_all_test_reward/std": 0.05636357143521309, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3616071343421936, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.38486185669898987, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1606714628297362, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.022124334354884923, "learning_rate": 1.0694529809465274e-05, "loss": 0.0009, "num_tokens": 7320779.0, "reward": 1.9029254913330078, "reward_std": 0.24035075306892395, "rewards/fixed_code_pass_all_test_reward/mean": 0.7779255509376526, "rewards/fixed_code_pass_all_test_reward/std": 0.02632846124470234, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24800795316696167, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 108.75, "completions/mean_terminated_length": 108.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1608559306401033, "frac_reward_zero_std": 1.0, "grad_norm": 0.4609375, "kl": 0.07639656635001302, "learning_rate": 1.070682237246466e-05, "loss": 0.0031, "num_tokens": 7327033.0, "reward": 1.814814805984497, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8148148059844971, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 361.75, "completions/mean_terminated_length": 361.75, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.16104039845047038, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.023313617799431086, "learning_rate": 1.0719114935464046e-05, "loss": 0.0009, "num_tokens": 7335327.0, "reward": 1.0416667461395264, "reward_std": 0.11785116046667099, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.16122486626083748, "frac_reward_zero_std": 1.0, "grad_norm": 0.058837890625, "kl": 0.029607139760628343, "learning_rate": 1.073140749846343e-05, "loss": 0.0012, "num_tokens": 7342611.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 219.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.16140933407120459, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.0274662293959409, "learning_rate": 1.0743700061462817e-05, "loss": 0.0011, "num_tokens": 7348900.0, "reward": 1.1915322542190552, "reward_std": 0.3339513838291168, "rewards/fixed_code_pass_all_test_reward/mean": 0.12903225421905518, "rewards/fixed_code_pass_all_test_reward/std": 0.3141762316226959, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 139.75, "completions/mean_terminated_length": 139.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.16159380188157166, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.10371031053364277, "learning_rate": 1.07559926244622e-05, "loss": 0.0041, "num_tokens": 7357554.0, "reward": 2.261904716491699, "reward_std": 0.2693740129470825, "rewards/fixed_code_pass_all_test_reward/mean": 0.8244047164916992, "rewards/fixed_code_pass_all_test_reward/std": 0.09259732067584991, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 211.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.16177826969193876, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.04241654323413968, "learning_rate": 1.0768285187461587e-05, "loss": 0.0017, "num_tokens": 7363460.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 160.375, "completions/mean_terminated_length": 160.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.16196273750230586, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.08334874222055078, "learning_rate": 1.0780577750460973e-05, "loss": 0.0033, "num_tokens": 7372519.0, "reward": 2.741666793823242, "reward_std": 0.39107102155685425, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.949999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213478565216, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.16214720531267293, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.09584637288935483, "learning_rate": 1.0792870313460357e-05, "loss": 0.0038, "num_tokens": 7378809.0, "reward": 2.3958332538604736, "reward_std": 0.7178975939750671, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6458333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.49149513244628906, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.16233167312304003, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.05286903399974108, "learning_rate": 1.0805162876459743e-05, "loss": 0.0021, "num_tokens": 7386962.0, "reward": 2.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 90.375, "completions/mean_terminated_length": 90.375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.16251614093340713, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.11456815153360367, "learning_rate": 1.0817455439459127e-05, "loss": 0.0046, "num_tokens": 7392781.0, "reward": 2.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.1627006087437742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.028504335205070674, "learning_rate": 1.0829748002458513e-05, "loss": 0.0011, "num_tokens": 7398319.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 185.25, "completions/mean_terminated_length": 185.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.1628850765541413, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.020676917280070484, "learning_rate": 1.08420405654579e-05, "loss": 0.0008, "num_tokens": 7403225.0, "reward": 2.875, "reward_std": 0.24800798296928406, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.1630695443645084, "frac_reward_zero_std": 0.0, "grad_norm": 4.125, "kl": 0.06285860855132341, "learning_rate": 1.0854333128457284e-05, "loss": 0.0025, "num_tokens": 7406509.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.16325401217487548, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.03811750351451337, "learning_rate": 1.086662569145667e-05, "loss": 0.0015, "num_tokens": 7412669.0, "reward": 2.0520832538604736, "reward_std": 0.7623811960220337, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5520833134651184, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.48578503727912903, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.16343847998524258, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.028583250008523464, "learning_rate": 1.0878918254456054e-05, "loss": 0.0011, "num_tokens": 7419084.0, "reward": 1.6666667461395264, "reward_std": 0.414502888917923, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.3725219666957855, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 190.875, "completions/mean_terminated_length": 190.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.16362294779560965, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.05469793174415827, "learning_rate": 1.089121081745544e-05, "loss": 0.0022, "num_tokens": 7428691.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 242.375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.16380741560597675, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.029494771268218756, "learning_rate": 1.0903503380454824e-05, "loss": 0.0012, "num_tokens": 7439126.0, "reward": 2.6031746864318848, "reward_std": 0.6122952699661255, "rewards/fixed_code_pass_all_test_reward/mean": 0.7222222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.44839513301849365, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8809523582458496, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2428104430437088, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 140.25, "completions/mean_terminated_length": 140.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.16399188341634385, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.040938789723441005, "learning_rate": 1.091579594345421e-05, "loss": 0.0016, "num_tokens": 7447072.0, "reward": 2.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.16417635122671093, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.03635703632608056, "learning_rate": 1.0928088506453596e-05, "loss": 0.0015, "num_tokens": 7455888.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 320.5, "completions/mean_terminated_length": 320.5, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.16436081903707803, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.02539125387556851, "learning_rate": 1.094038106945298e-05, "loss": 0.001, "num_tokens": 7466564.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 437.25, "completions/mean_terminated_length": 437.25, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.16454528684744513, "frac_reward_zero_std": 0.0, "grad_norm": 0.5546875, "kl": 0.006356904661515728, "learning_rate": 1.0952673632452367e-05, "loss": 0.0003, "num_tokens": 7475102.0, "reward": 2.032451629638672, "reward_std": 0.08381511270999908, "rewards/fixed_code_pass_all_test_reward/mean": 0.9230769276618958, "rewards/fixed_code_pass_all_test_reward/std": 0.07121692597866058, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.109375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.04419417306780815, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 126.75, "completions/mean_terminated_length": 126.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.1647297546578122, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.05862652510404587, "learning_rate": 1.0964966195451751e-05, "loss": 0.0023, "num_tokens": 7481748.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 227.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1649142224681793, "frac_reward_zero_std": 1.0, "grad_norm": 0.041259765625, "kl": 0.017446873243898153, "learning_rate": 1.0977258758451137e-05, "loss": 0.0007, "num_tokens": 7487861.0, "reward": 1.399999976158142, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.1650986902785464, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.051434525521472096, "learning_rate": 1.0989551321450523e-05, "loss": 0.0021, "num_tokens": 7494744.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 570.75, "completions/mean_terminated_length": 570.75, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.16528315808891347, "frac_reward_zero_std": 0.0, "grad_norm": 0.62890625, "kl": 0.01439899654360488, "learning_rate": 1.1001843884449909e-05, "loss": 0.0006, "num_tokens": 7510782.0, "reward": 2.75, "reward_std": 0.18898223340511322, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.18898223340511322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.16546762589928057, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.05019568919669837, "learning_rate": 1.1014136447449295e-05, "loss": 0.002, "num_tokens": 7518579.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 374.0, "completions/mean_terminated_length": 374.0, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.16565209370964767, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.029953620629385114, "learning_rate": 1.1026429010448681e-05, "loss": 0.0012, "num_tokens": 7527251.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 215.5, "completions/mean_terminated_length": 215.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.16583656152001475, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.04250514158047736, "learning_rate": 1.1038721573448065e-05, "loss": 0.0017, "num_tokens": 7535007.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 575.375, "completions/mean_terminated_length": 575.375, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 0.16602102933038185, "frac_reward_zero_std": 0.0, "grad_norm": 0.703125, "kl": 0.01297433685977012, "learning_rate": 1.1051014136447451e-05, "loss": 0.0005, "num_tokens": 7550698.0, "reward": 1.8032407760620117, "reward_std": 0.3254411816596985, "rewards/fixed_code_pass_all_test_reward/mean": 0.11574073135852814, "rewards/fixed_code_pass_all_test_reward/std": 0.09584243595600128, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25877460837364197, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 207.125, "completions/mean_terminated_length": 207.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.16620549714074895, "frac_reward_zero_std": 0.0, "grad_norm": 0.734375, "kl": 0.029172983835451305, "learning_rate": 1.1063306699446836e-05, "loss": 0.0012, "num_tokens": 7555859.0, "reward": 2.3482143878936768, "reward_std": 0.2804575264453888, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4107142984867096, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23868975043296814, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 82.375, "completions/mean_terminated_length": 82.375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.16638996495111602, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.06509940465912223, "learning_rate": 1.1075599262446222e-05, "loss": 0.0026, "num_tokens": 7559374.0, "reward": 1.625, "reward_std": 1.0606601238250732, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.16657443276148312, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.016325444099493325, "learning_rate": 1.1087891825445608e-05, "loss": 0.0007, "num_tokens": 7565199.0, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.16675890057185022, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.0353142861276865, "learning_rate": 1.1100184388444992e-05, "loss": 0.0014, "num_tokens": 7574343.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 456.125, "completions/mean_terminated_length": 456.125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.1669433683822173, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.018130484269931912, "learning_rate": 1.1112476951444378e-05, "loss": 0.0007, "num_tokens": 7583896.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 235.875, "completions/mean_terminated_length": 235.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.1671278361925844, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.02317343687172979, "learning_rate": 1.1124769514443762e-05, "loss": 0.0009, "num_tokens": 7590631.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 118.625, "completions/mean_terminated_length": 118.625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.1673123040029515, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.09297320153564215, "learning_rate": 1.1137062077443148e-05, "loss": 0.0037, "num_tokens": 7597860.0, "reward": 2.4459457397460938, "reward_std": 0.34520912170410156, "rewards/fixed_code_pass_all_test_reward/mean": 0.44594594836235046, "rewards/fixed_code_pass_all_test_reward/std": 0.34520915150642395, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 388.5, "completions/mean_terminated_length": 388.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.16749677181331857, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.02891680976608768, "learning_rate": 1.1149354640442534e-05, "loss": 0.0012, "num_tokens": 7606312.0, "reward": 2.9124999046325684, "reward_std": 0.12174325436353683, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9125000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12174329161643982, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 216.875, "completions/mean_terminated_length": 216.875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.16768123962368567, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.019668662920594215, "learning_rate": 1.1161647203441918e-05, "loss": 0.0008, "num_tokens": 7612319.0, "reward": 1.402083396911621, "reward_std": 0.2308434396982193, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.09708039462566376, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21458333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23255610466003418, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 65.5, "completions/mean_terminated_length": 65.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.16786570743405277, "frac_reward_zero_std": 0.0, "grad_norm": 3.40625, "kl": 0.08162572514265776, "learning_rate": 1.1173939766441304e-05, "loss": 0.0033, "num_tokens": 7615683.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.16805017524441984, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.022961496375501156, "learning_rate": 1.1186232329440689e-05, "loss": 0.0009, "num_tokens": 7622666.0, "reward": 2.128378391265869, "reward_std": 0.3523152768611908, "rewards/fixed_code_pass_all_test_reward/mean": 0.12837837636470795, "rewards/fixed_code_pass_all_test_reward/std": 0.3523152768611908, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 141.375, "completions/mean_terminated_length": 141.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.16823464305478694, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.040577440056949854, "learning_rate": 1.1198524892440075e-05, "loss": 0.0016, "num_tokens": 7629861.0, "reward": 2.53125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.53125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 136.875, "completions/mean_terminated_length": 136.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.16841911086515404, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.05516999075189233, "learning_rate": 1.121081745543946e-05, "loss": 0.0022, "num_tokens": 7637220.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.16860357867552112, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.04083979350980371, "learning_rate": 1.1223110018438845e-05, "loss": 0.0016, "num_tokens": 7645331.0, "reward": 1.8125, "reward_std": 0.035355329513549805, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.035355325788259506, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 317.0, "completions/mean_terminated_length": 317.0, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.16878804648588822, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.016348825534805655, "learning_rate": 1.1235402581438231e-05, "loss": 0.0007, "num_tokens": 7652787.0, "reward": 1.8208333253860474, "reward_std": 0.19915495812892914, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.21380898356437683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02083333395421505, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 110.625, "completions/mean_terminated_length": 110.625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.16897251429625532, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125, "kl": 0.05909560131840408, "learning_rate": 1.1247695144437615e-05, "loss": 0.0024, "num_tokens": 7656552.0, "reward": 1.4166667461395264, "reward_std": 0.9553525447845459, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 155.375, "completions/mean_terminated_length": 155.375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.1691569821066224, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.060155365616083145, "learning_rate": 1.1259987707437001e-05, "loss": 0.0024, "num_tokens": 7665179.0, "reward": 2.3257575035095215, "reward_std": 0.4648895859718323, "rewards/fixed_code_pass_all_test_reward/mean": 0.40909093618392944, "rewards/fixed_code_pass_all_test_reward/std": 0.3668687641620636, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022762298584, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 109.875, "completions/mean_terminated_length": 109.875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.1693414499169895, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.06170762749388814, "learning_rate": 1.1272280270436386e-05, "loss": 0.0025, "num_tokens": 7671706.0, "reward": 2.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1695259177273566, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.048168299021199346, "learning_rate": 1.1284572833435772e-05, "loss": 0.0019, "num_tokens": 7675874.0, "reward": 2.0416667461395264, "reward_std": 0.8409650325775146, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39198318123817444, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 722.625, "completions/mean_terminated_length": 722.625, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.16971038553772366, "frac_reward_zero_std": 0.0, "grad_norm": 0.435546875, "kl": 0.01591713697416708, "learning_rate": 1.1296865396435158e-05, "loss": 0.0006, "num_tokens": 7694335.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 75.5, "completions/mean_terminated_length": 75.5, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.16989485334809076, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.07138880901038647, "learning_rate": 1.1309157959434542e-05, "loss": 0.0029, "num_tokens": 7702099.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 118.875, "completions/mean_terminated_length": 118.875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.17007932115845784, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.052153136348351836, "learning_rate": 1.1321450522433928e-05, "loss": 0.0021, "num_tokens": 7710034.0, "reward": 2.555833339691162, "reward_std": 0.16418872773647308, "rewards/fixed_code_pass_all_test_reward/mean": 0.8474999666213989, "rewards/fixed_code_pass_all_test_reward/std": 0.132638081908226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7083333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19416078925132751, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 94.0, "completions/mean_terminated_length": 94.0, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.17026378896882494, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.05882852151989937, "learning_rate": 1.1333743085433312e-05, "loss": 0.0024, "num_tokens": 7713850.0, "reward": 2.3333334922790527, "reward_std": 0.7559288740158081, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8333333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.30860668420791626, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 72.25, "completions/mean_terminated_length": 72.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.17044825677919204, "frac_reward_zero_std": 0.0, "grad_norm": 3.765625, "kl": 0.07102454407140613, "learning_rate": 1.1346035648432698e-05, "loss": 0.0028, "num_tokens": 7717436.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 116.625, "completions/mean_terminated_length": 116.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.1706327245895591, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.07784443022683263, "learning_rate": 1.1358328211432084e-05, "loss": 0.0031, "num_tokens": 7723561.0, "reward": 2.5625, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.1708171923999262, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.02916593663394451, "learning_rate": 1.1370620774431469e-05, "loss": 0.0012, "num_tokens": 7729511.0, "reward": 1.7132352590560913, "reward_std": 0.18618230521678925, "rewards/fixed_code_pass_all_test_reward/mean": 0.7132353186607361, "rewards/fixed_code_pass_all_test_reward/std": 0.18618233501911163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.1710016602102933, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.06371189188212156, "learning_rate": 1.1382913337430855e-05, "loss": 0.0025, "num_tokens": 7737257.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.17118612802066038, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.0555245541036129, "learning_rate": 1.1395205900430239e-05, "loss": 0.0022, "num_tokens": 7746097.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 281.25, "completions/mean_terminated_length": 281.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.17137059583102748, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.03709913860075176, "learning_rate": 1.1407498463429627e-05, "loss": 0.0015, "num_tokens": 7753155.0, "reward": 0.96875, "reward_std": 0.2086307406425476, "rewards/fixed_code_pass_all_test_reward/mean": 0.09375, "rewards/fixed_code_pass_all_test_reward/std": 0.18600596487522125, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 230.75, "completions/mean_terminated_length": 230.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.17155506364139458, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.044663689797744155, "learning_rate": 1.1419791026429013e-05, "loss": 0.0018, "num_tokens": 7762753.0, "reward": 2.375, "reward_std": 0.46041733026504517, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.46041733026504517, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 114.125, "completions/mean_terminated_length": 114.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.17173953145176166, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.10006767604500055, "learning_rate": 1.1432083589428397e-05, "loss": 0.004, "num_tokens": 7769130.0, "reward": 2.3125, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 65.125, "completions/mean_terminated_length": 65.125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.17192399926212876, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.07786376588046551, "learning_rate": 1.1444376152427783e-05, "loss": 0.0031, "num_tokens": 7772491.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 200.875, "completions/mean_terminated_length": 200.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.17210846707249586, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.04157975944690406, "learning_rate": 1.1456668715427169e-05, "loss": 0.0017, "num_tokens": 7778186.0, "reward": 1.5386029481887817, "reward_std": 0.19807173311710358, "rewards/fixed_code_pass_all_test_reward/mean": 0.47610294818878174, "rewards/fixed_code_pass_all_test_reward/std": 0.067591093480587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.17229293488286293, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.037722833920270205, "learning_rate": 1.1468961278426553e-05, "loss": 0.0015, "num_tokens": 7789339.0, "reward": 2.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 116.375, "completions/mean_terminated_length": 116.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.17247740269323003, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.06581647275015712, "learning_rate": 1.148125384142594e-05, "loss": 0.0026, "num_tokens": 7796286.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.17266187050359713, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.04680960066616535, "learning_rate": 1.1493546404425324e-05, "loss": 0.0019, "num_tokens": 7804532.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 150.375, "completions/mean_terminated_length": 150.375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.1728463383139642, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.03866378637030721, "learning_rate": 1.150583896742471e-05, "loss": 0.0015, "num_tokens": 7812823.0, "reward": 2.043919086456299, "reward_std": 0.8742179870605469, "rewards/fixed_code_pass_all_test_reward/mean": 0.2939189076423645, "rewards/fixed_code_pass_all_test_reward/std": 0.31032782793045044, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 295.875, "completions/mean_terminated_length": 295.875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.1730308061243313, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.1129335502628237, "learning_rate": 1.1518131530424096e-05, "loss": 0.0045, "num_tokens": 7824142.0, "reward": 1.413690447807312, "reward_std": 0.24708786606788635, "rewards/fixed_code_pass_all_test_reward/mean": 0.4136904776096344, "rewards/fixed_code_pass_all_test_reward/std": 0.24708783626556396, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 131.25, "completions/mean_terminated_length": 131.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1732152739346984, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.04665721021592617, "learning_rate": 1.153042409342348e-05, "loss": 0.0019, "num_tokens": 7831744.0, "reward": 2.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.17339974174506548, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.060611390275880694, "learning_rate": 1.1542716656422866e-05, "loss": 0.0024, "num_tokens": 7841014.0, "reward": 2.41011905670166, "reward_std": 0.6771308779716492, "rewards/fixed_code_pass_all_test_reward/mean": 0.6726190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.4668731391429901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.737500011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3889087438583374, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 115.875, "completions/mean_terminated_length": 115.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.17358420955543258, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.06885682651773095, "learning_rate": 1.155500921942225e-05, "loss": 0.0028, "num_tokens": 7845069.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.17376867736579968, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.03956237155944109, "learning_rate": 1.1567301782421636e-05, "loss": 0.0016, "num_tokens": 7854981.0, "reward": 1.8541667461395264, "reward_std": 0.3500283360481262, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.35634833574295044, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.17395314517616675, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.04510825825855136, "learning_rate": 1.1579594345421022e-05, "loss": 0.0018, "num_tokens": 7861021.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.17413761298653385, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.06597976293414831, "learning_rate": 1.1591886908420406e-05, "loss": 0.0026, "num_tokens": 7869156.0, "reward": 1.8211956024169922, "reward_std": 0.3255726397037506, "rewards/fixed_code_pass_all_test_reward/mean": 0.4836956262588501, "rewards/fixed_code_pass_all_test_reward/std": 0.21481390297412872, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3375000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23260942101478577, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.17432208079690095, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.026016850606538355, "learning_rate": 1.1604179471419792e-05, "loss": 0.001, "num_tokens": 7877353.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.17450654860726802, "frac_reward_zero_std": 1.0, "grad_norm": 0.036376953125, "kl": 0.03598828939720988, "learning_rate": 1.1616472034419177e-05, "loss": 0.0014, "num_tokens": 7883503.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 103.625, "completions/mean_terminated_length": 103.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.17469101641763513, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.07512486074119806, "learning_rate": 1.1628764597418563e-05, "loss": 0.003, "num_tokens": 7887684.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 391.125, "completions/mean_terminated_length": 391.125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.17487548422800223, "frac_reward_zero_std": 0.0, "grad_norm": 0.7578125, "kl": 0.019258150830864906, "learning_rate": 1.1641057160417947e-05, "loss": 0.0008, "num_tokens": 7899413.0, "reward": 2.7083334922790527, "reward_std": 0.11785111576318741, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.11785111576318741, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1750599520383693, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.06067376141436398, "learning_rate": 1.1653349723417333e-05, "loss": 0.0024, "num_tokens": 7904454.0, "reward": 1.289285659790039, "reward_std": 0.15381449460983276, "rewards/fixed_code_pass_all_test_reward/mean": 0.28928571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.15381447970867157, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.1752444198487364, "frac_reward_zero_std": 1.0, "grad_norm": 0.042724609375, "kl": 0.025688752066344023, "learning_rate": 1.1665642286416719e-05, "loss": 0.001, "num_tokens": 7910835.0, "reward": 1.8518519401550293, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8518518805503845, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.1754288876591035, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.03722355840727687, "learning_rate": 1.1677934849416103e-05, "loss": 0.0015, "num_tokens": 7916722.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 204.625, "completions/mean_terminated_length": 204.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.17561335546947057, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.04863691236823797, "learning_rate": 1.169022741241549e-05, "loss": 0.0019, "num_tokens": 7925663.0, "reward": 2.0219879150390625, "reward_std": 0.12051479518413544, "rewards/fixed_code_pass_all_test_reward/mean": 0.7469879388809204, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2750000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12051477283239365, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.17579782327983767, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.04438622482120991, "learning_rate": 1.1702519975414874e-05, "loss": 0.0018, "num_tokens": 7931097.0, "reward": 1.704545497894287, "reward_std": 0.12624818086624146, "rewards/fixed_code_pass_all_test_reward/mean": 0.7045454382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.12624819576740265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 39.625, "completions/mean_terminated_length": 39.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.17598229109020475, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "kl": 0.14466728642582893, "learning_rate": 1.171481253841426e-05, "loss": 0.0058, "num_tokens": 7934046.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.17616675890057185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0234375, "kl": 0.019318212987855077, "learning_rate": 1.1727105101413646e-05, "loss": 0.0008, "num_tokens": 7942355.0, "reward": 2.7272727489471436, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 78.625, "completions/mean_terminated_length": 78.625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.17635122671093895, "frac_reward_zero_std": 1.0, "grad_norm": 0.41796875, "kl": 0.09308621939271688, "learning_rate": 1.173939766441303e-05, "loss": 0.0037, "num_tokens": 7946000.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 122.625, "completions/mean_terminated_length": 122.625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.17653569452130602, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "kl": 0.22382903099060059, "learning_rate": 1.1751690227412416e-05, "loss": 0.009, "num_tokens": 7953309.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 113.25, "completions/mean_terminated_length": 113.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.17672016233167312, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.05930087948217988, "learning_rate": 1.17639827904118e-05, "loss": 0.0024, "num_tokens": 7960271.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 287.375, "completions/mean_terminated_length": 287.375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.17690463014204022, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.02626537950709462, "learning_rate": 1.1776275353411186e-05, "loss": 0.0011, "num_tokens": 7967674.0, "reward": 1.3359375, "reward_std": 0.0941212847828865, "rewards/fixed_code_pass_all_test_reward/mean": 0.3046875, "rewards/fixed_code_pass_all_test_reward/std": 0.022097086533904076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 102.25, "completions/mean_terminated_length": 102.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1770890979524073, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.08622897509485483, "learning_rate": 1.1788567916410572e-05, "loss": 0.0034, "num_tokens": 7971620.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 205.875, "completions/mean_terminated_length": 205.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.1772735657627744, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.0337520120665431, "learning_rate": 1.1800860479409958e-05, "loss": 0.0014, "num_tokens": 7979347.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.1774580335731415, "frac_reward_zero_std": 1.0, "grad_norm": 0.2421875, "kl": 0.1424331422895193, "learning_rate": 1.1813153042409344e-05, "loss": 0.0057, "num_tokens": 7982679.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 162.125, "completions/mean_terminated_length": 162.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.17764250138350857, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.0302286921069026, "learning_rate": 1.182544560540873e-05, "loss": 0.0012, "num_tokens": 7990776.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.17782696919387567, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.08999729622155428, "learning_rate": 1.1837738168408115e-05, "loss": 0.0036, "num_tokens": 7998241.0, "reward": 1.875, "reward_std": 0.1725163608789444, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17251639068126678, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 240.375, "completions/mean_terminated_length": 240.375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.17801143700424277, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.02532421867363155, "learning_rate": 1.18500307314075e-05, "loss": 0.001, "num_tokens": 8004740.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 92.5, "completions/mean_terminated_length": 92.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.17819590481460984, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.13065451383590698, "learning_rate": 1.1862323294406885e-05, "loss": 0.0052, "num_tokens": 8010272.0, "reward": 2.9749999046325684, "reward_std": 0.07071065902709961, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9750000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106739282608, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 297.125, "completions/mean_terminated_length": 297.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.17838037262497694, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.02133295766543597, "learning_rate": 1.1874615857406271e-05, "loss": 0.0009, "num_tokens": 8016977.0, "reward": 1.0729167461395264, "reward_std": 0.1368400603532791, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 374.0, "completions/mean_terminated_length": 374.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.17856484043534404, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.0962518691085279, "learning_rate": 1.1886908420405657e-05, "loss": 0.0039, "num_tokens": 8030977.0, "reward": 2.4375, "reward_std": 0.3925648629665375, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.44077855348587036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 219.375, "completions/mean_terminated_length": 219.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.1787493082457111, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.03230156050994992, "learning_rate": 1.1899200983405041e-05, "loss": 0.0013, "num_tokens": 8037228.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 415.5, "completions/mean_terminated_length": 415.5, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.17893377605607821, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.02233269263524562, "learning_rate": 1.1911493546404427e-05, "loss": 0.0009, "num_tokens": 8046720.0, "reward": 1.0892857313156128, "reward_std": 0.13087505102157593, "rewards/fixed_code_pass_all_test_reward/mean": 0.0892857164144516, "rewards/fixed_code_pass_all_test_reward/std": 0.13087506592273712, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.17911824386644531, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.04128297441639006, "learning_rate": 1.1923786109403812e-05, "loss": 0.0017, "num_tokens": 8053003.0, "reward": 1.1979167461395264, "reward_std": 0.11785116046667099, "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 507.375, "completions/mean_terminated_length": 507.375, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.1793027116768124, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.017984784964937717, "learning_rate": 1.1936078672403198e-05, "loss": 0.0007, "num_tokens": 8066182.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 68.875, "completions/mean_terminated_length": 68.875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.1794871794871795, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.13462564442306757, "learning_rate": 1.1948371235402584e-05, "loss": 0.0054, "num_tokens": 8069565.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 235.0, "completions/mean_terminated_length": 235.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.1796716472975466, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.043509319657459855, "learning_rate": 1.1960663798401968e-05, "loss": 0.0017, "num_tokens": 8075621.0, "reward": 1.4268293380737305, "reward_std": 0.15147779881954193, "rewards/fixed_code_pass_all_test_reward/mean": 0.4268292784690857, "rewards/fixed_code_pass_all_test_reward/std": 0.15147781372070312, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.17985611510791366, "frac_reward_zero_std": 1.0, "grad_norm": 0.30859375, "kl": 0.04431539890356362, "learning_rate": 1.1972956361401354e-05, "loss": 0.0018, "num_tokens": 8081206.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 82.625, "completions/mean_terminated_length": 82.625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.18004058291828076, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.10866844141855836, "learning_rate": 1.1985248924400738e-05, "loss": 0.0043, "num_tokens": 8085459.0, "reward": 2.0374999046325684, "reward_std": 0.459619402885437, "rewards/fixed_code_pass_all_test_reward/mean": 0.9125000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2474873811006546, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.18022505072864786, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.03481556195765734, "learning_rate": 1.1997541487400124e-05, "loss": 0.0014, "num_tokens": 8094625.0, "reward": 1.9595239162445068, "reward_std": 0.04714047908782959, "rewards/fixed_code_pass_all_test_reward/mean": 0.6428571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3166666626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0471404567360878, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 103.125, "completions/mean_terminated_length": 103.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.18040951853901493, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.0795166390016675, "learning_rate": 1.2009834050399508e-05, "loss": 0.0032, "num_tokens": 8099410.0, "reward": 2.2708334922790527, "reward_std": 0.2946278154850006, "rewards/fixed_code_pass_all_test_reward/mean": 0.2708333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.2946278154850006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.18059398634938204, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.025393910706043243, "learning_rate": 1.2022126613398894e-05, "loss": 0.001, "num_tokens": 8106032.0, "reward": 1.6607142686843872, "reward_std": 0.10628914088010788, "rewards/fixed_code_pass_all_test_reward/mean": 0.660714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.10628912597894669, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 63.125, "completions/mean_terminated_length": 63.125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.18077845415974914, "frac_reward_zero_std": 0.0, "grad_norm": 5.65625, "kl": 0.18063935171812773, "learning_rate": 1.203441917639828e-05, "loss": 0.0072, "num_tokens": 8109329.0, "reward": 1.3333333730697632, "reward_std": 0.4714045226573944, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0833333358168602, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022911310196, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.1809629219701162, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.06980455992743373, "learning_rate": 1.2046711739397665e-05, "loss": 0.0028, "num_tokens": 8113962.0, "reward": 2.799999952316284, "reward_std": 0.37032803893089294, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.37032803893089294, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 127.875, "completions/mean_terminated_length": 127.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.1811473897804833, "frac_reward_zero_std": 0.0, "grad_norm": 4.40625, "kl": 0.06414887076243758, "learning_rate": 1.205900430239705e-05, "loss": 0.0026, "num_tokens": 8120809.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 135.75, "completions/mean_terminated_length": 135.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.1813318575908504, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.03459434537217021, "learning_rate": 1.2071296865396435e-05, "loss": 0.0014, "num_tokens": 8126903.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.18151632540121748, "frac_reward_zero_std": 0.0, "grad_norm": 4.34375, "kl": 0.12292393576353788, "learning_rate": 1.2083589428395821e-05, "loss": 0.0049, "num_tokens": 8130189.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 190.875, "completions/mean_terminated_length": 190.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.18170079321158458, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.04055106081068516, "learning_rate": 1.2095881991395207e-05, "loss": 0.0016, "num_tokens": 8137972.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.18188526102195168, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.05335805076174438, "learning_rate": 1.2108174554394591e-05, "loss": 0.0021, "num_tokens": 8148446.0, "reward": 1.6578948497772217, "reward_std": 0.30860671401023865, "rewards/fixed_code_pass_all_test_reward/mean": 0.15789473056793213, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.30860671401023865, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 297.25, "completions/mean_terminated_length": 297.25, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.18206972883231876, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.030610819114372134, "learning_rate": 1.2120467117393977e-05, "loss": 0.0012, "num_tokens": 8155360.0, "reward": 1.8197115659713745, "reward_std": 0.343232125043869, "rewards/fixed_code_pass_all_test_reward/mean": 0.9134615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.05341270938515663, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 406.0, "completions/mean_terminated_length": 406.0, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.18225419664268586, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.02474592998623848, "learning_rate": 1.2132759680393362e-05, "loss": 0.001, "num_tokens": 8163840.0, "reward": 1.9047619104385376, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9047619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.18243866445305293, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.040815221378579736, "learning_rate": 1.2145052243392748e-05, "loss": 0.0016, "num_tokens": 8171866.0, "reward": 2.439393997192383, "reward_std": 0.4741791784763336, "rewards/fixed_code_pass_all_test_reward/mean": 0.43939393758773804, "rewards/fixed_code_pass_all_test_reward/std": 0.4741791784763336, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 80.375, "completions/mean_terminated_length": 80.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.18262313226342003, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.0621514325030148, "learning_rate": 1.2157344806392134e-05, "loss": 0.0025, "num_tokens": 8175373.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 160.875, "completions/mean_terminated_length": 160.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.18280760007378713, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.04342365823686123, "learning_rate": 1.2169637369391518e-05, "loss": 0.0017, "num_tokens": 8184092.0, "reward": 2.375, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 106.5, "completions/mean_terminated_length": 106.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.1829920678841542, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.0539030316285789, "learning_rate": 1.2181929932390904e-05, "loss": 0.0022, "num_tokens": 8188176.0, "reward": 2.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 823.375, "completions/mean_terminated_length": 823.375, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 0.1831765356945213, "frac_reward_zero_std": 1.0, "grad_norm": 0.034423828125, "kl": 0.01368710957467556, "learning_rate": 1.2194222495390288e-05, "loss": 0.0005, "num_tokens": 8203419.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 314.875, "completions/mean_terminated_length": 314.875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.1833610035048884, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.033438652753829956, "learning_rate": 1.2206515058389676e-05, "loss": 0.0013, "num_tokens": 8211082.0, "reward": 1.375, "reward_std": 0.25253811478614807, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.18354547131525548, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.03671476570889354, "learning_rate": 1.2218807621389062e-05, "loss": 0.0015, "num_tokens": 8219570.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 435.25, "completions/mean_terminated_length": 435.25, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.18372993912562258, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.014276882720878348, "learning_rate": 1.2231100184388446e-05, "loss": 0.0006, "num_tokens": 8232644.0, "reward": 2.0591793060302734, "reward_std": 0.26030871272087097, "rewards/fixed_code_pass_all_test_reward/mean": 0.7913222908973694, "rewards/fixed_code_pass_all_test_reward/std": 0.22833816707134247, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2678571343421936, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.05050762742757797, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 456.75, "completions/mean_terminated_length": 456.75, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.18391440693598968, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.16299163014627993, "learning_rate": 1.2243392747387832e-05, "loss": 0.0065, "num_tokens": 8246346.0, "reward": 2.59375, "reward_std": 1.0516780614852905, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.84375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35197150707244873, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.18409887474635675, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.03414763626642525, "learning_rate": 1.2255685310387218e-05, "loss": 0.0014, "num_tokens": 8253461.0, "reward": 0.9583333730697632, "reward_std": 0.6025738716125488, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.10286889970302582, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 1116.0, "completions/mean_terminated_length": 1116.0, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 0.18428334255672385, "frac_reward_zero_std": 0.0, "grad_norm": 0.58984375, "kl": 0.0072173890948761255, "learning_rate": 1.2267977873386603e-05, "loss": 0.0003, "num_tokens": 8272413.0, "reward": 1.5507246255874634, "reward_std": 0.7307814359664917, "rewards/fixed_code_pass_all_test_reward/mean": 0.6757246255874634, "rewards/fixed_code_pass_all_test_reward/std": 0.4647336006164551, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 404.25, "completions/mean_terminated_length": 404.25, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.18446781036709095, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.021475051529705524, "learning_rate": 1.2280270436385989e-05, "loss": 0.0009, "num_tokens": 8281015.0, "reward": 1.822115421295166, "reward_std": 0.02670634165406227, "rewards/fixed_code_pass_all_test_reward/mean": 0.8221153616905212, "rewards/fixed_code_pass_all_test_reward/std": 0.02670636959373951, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 141.875, "completions/mean_terminated_length": 141.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.18465227817745802, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.08033758029341698, "learning_rate": 1.2292562999385373e-05, "loss": 0.0032, "num_tokens": 8288502.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 159.125, "completions/mean_terminated_length": 159.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.18483674598782512, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05688095488585532, "learning_rate": 1.2304855562384759e-05, "loss": 0.0023, "num_tokens": 8295567.0, "reward": 1.8058712482452393, "reward_std": 0.6805416345596313, "rewards/fixed_code_pass_all_test_reward/mean": 0.9829545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.023524971678853035, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1979166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16629423201084137, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.18502121379819222, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05125757725909352, "learning_rate": 1.2317148125384145e-05, "loss": 0.0021, "num_tokens": 8304889.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 140.5, "completions/mean_terminated_length": 140.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.1852056816085593, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.10633875615894794, "learning_rate": 1.232944068838353e-05, "loss": 0.0043, "num_tokens": 8311597.0, "reward": 2.03125, "reward_std": 0.4712729752063751, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.1853901494189264, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.02354512456804514, "learning_rate": 1.2341733251382915e-05, "loss": 0.0009, "num_tokens": 8318119.0, "reward": 1.8110888004302979, "reward_std": 0.13770726323127747, "rewards/fixed_code_pass_all_test_reward/mean": 0.6881720423698425, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.12291666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13770727813243866, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.1855746172292935, "frac_reward_zero_std": 1.0, "grad_norm": 1.1484375, "kl": 0.10824422957375646, "learning_rate": 1.23540258143823e-05, "loss": 0.0043, "num_tokens": 8322503.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 362.75, "completions/mean_terminated_length": 362.75, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.18575908503966057, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.025966209243051708, "learning_rate": 1.2366318377381685e-05, "loss": 0.001, "num_tokens": 8334573.0, "reward": 2.8521769046783447, "reward_std": 0.3025483191013336, "rewards/fixed_code_pass_all_test_reward/mean": 0.8834269642829895, "rewards/fixed_code_pass_all_test_reward/std": 0.3033967912197113, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 542.75, "completions/mean_terminated_length": 542.75, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.18594355285002767, "frac_reward_zero_std": 0.0, "grad_norm": 0.53515625, "kl": 0.013649103115312755, "learning_rate": 1.237861094038107e-05, "loss": 0.0005, "num_tokens": 8354187.0, "reward": 2.104032278060913, "reward_std": 0.36925774812698364, "rewards/fixed_code_pass_all_test_reward/mean": 0.12903225421905518, "rewards/fixed_code_pass_all_test_reward/std": 0.3521050810813904, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9750000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106739282608, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 384.5, "completions/mean_terminated_length": 384.5, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.18612802066039477, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.056502555147744715, "learning_rate": 1.2390903503380456e-05, "loss": 0.0023, "num_tokens": 8366247.0, "reward": 2.3709349632263184, "reward_std": 0.4697980284690857, "rewards/fixed_code_pass_all_test_reward/mean": 0.5792682766914368, "rewards/fixed_code_pass_all_test_reward/std": 0.17234183847904205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39591163396835327, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.18631248847076184, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.05896757682785392, "learning_rate": 1.2403196066379842e-05, "loss": 0.0024, "num_tokens": 8370052.0, "reward": 1.649999976158142, "reward_std": 0.5424810647964478, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.18649695628112894, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.04294520849362016, "learning_rate": 1.2415488629379226e-05, "loss": 0.0017, "num_tokens": 8378042.0, "reward": 2.5, "reward_std": 1.0690449476242065, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.18668142409149605, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.08582831593230367, "learning_rate": 1.2427781192378612e-05, "loss": 0.0034, "num_tokens": 8385717.0, "reward": 2.637500047683716, "reward_std": 0.48715198040008545, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.29806843400001526, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8458333015441895, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25130611658096313, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 342.875, "completions/mean_terminated_length": 342.875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.18686589190186312, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.13509508315473795, "learning_rate": 1.2440073755377996e-05, "loss": 0.0054, "num_tokens": 8395364.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.18705035971223022, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.04895245190709829, "learning_rate": 1.2452366318377382e-05, "loss": 0.002, "num_tokens": 8402668.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 476.5, "completions/mean_terminated_length": 476.5, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.18723482752259732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.022500405088067055, "learning_rate": 1.2464658881376768e-05, "loss": 0.0009, "num_tokens": 8412408.0, "reward": 1.3793103694915771, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.37931033968925476, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 78.75, "completions/mean_terminated_length": 78.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.1874192953329644, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.052191125927492976, "learning_rate": 1.2476951444376153e-05, "loss": 0.0021, "num_tokens": 8415774.0, "reward": 1.2083332538604736, "reward_std": 0.2969420850276947, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2083333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2969421148300171, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.1876037631433315, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0317008065758273, "learning_rate": 1.2489244007375539e-05, "loss": 0.0013, "num_tokens": 8420270.0, "reward": 2.0250000953674316, "reward_std": 0.07071065902709961, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.1877882309536986, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.0362692263443023, "learning_rate": 1.2501536570374923e-05, "loss": 0.0015, "num_tokens": 8430116.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.18797269876406567, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.09473582357168198, "learning_rate": 1.2513829133374309e-05, "loss": 0.0038, "num_tokens": 8435684.0, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.18815716657443277, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.01881978940218687, "learning_rate": 1.2526121696373695e-05, "loss": 0.0008, "num_tokens": 8441145.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.18834163438479984, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.03482384537346661, "learning_rate": 1.253841425937308e-05, "loss": 0.0014, "num_tokens": 8446897.0, "reward": 2.950000047683716, "reward_std": 0.09258199483156204, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.949999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 312.0, "completions/mean_terminated_length": 312.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.18852610219516694, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.021303610177710652, "learning_rate": 1.2550706822372465e-05, "loss": 0.0009, "num_tokens": 8458345.0, "reward": 2.5198864936828613, "reward_std": 0.26208609342575073, "rewards/fixed_code_pass_all_test_reward/mean": 0.6136363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.18018747866153717, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 373.375, "completions/mean_terminated_length": 373.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.18871057000553404, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.0377312395721674, "learning_rate": 1.256299938537185e-05, "loss": 0.0015, "num_tokens": 8468492.0, "reward": 2.5250000953674316, "reward_std": 0.7401737570762634, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6499999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4869731366634369, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.1888950378159011, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.017899554659379646, "learning_rate": 1.2575291948371236e-05, "loss": 0.0007, "num_tokens": 8473104.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 770.875, "completions/mean_terminated_length": 770.875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.1890795056262682, "frac_reward_zero_std": 0.0, "grad_norm": 4.0, "kl": 0.012986215704586357, "learning_rate": 1.258758451137062e-05, "loss": 0.0005, "num_tokens": 8488007.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 214.625, "completions/mean_terminated_length": 214.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.1892639734366353, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.09162075212225318, "learning_rate": 1.2599877074370006e-05, "loss": 0.0037, "num_tokens": 8495564.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 456.125, "completions/mean_terminated_length": 456.125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.18944844124700239, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.021230397862382233, "learning_rate": 1.2612169637369394e-05, "loss": 0.0008, "num_tokens": 8504005.0, "reward": 1.1374999284744263, "reward_std": 0.16372403502464294, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13750000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16372402012348175, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.1896329090573695, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0351751702837646, "learning_rate": 1.262446220036878e-05, "loss": 0.0014, "num_tokens": 8510695.0, "reward": 1.7946970462799072, "reward_std": 0.50639808177948, "rewards/fixed_code_pass_all_test_reward/mean": 0.7196969985961914, "rewards/fixed_code_pass_all_test_reward/std": 0.40421175956726074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2121320515871048, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.1898173768677366, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.08062097756192088, "learning_rate": 1.2636754763368164e-05, "loss": 0.0032, "num_tokens": 8518882.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 255.5, "completions/mean_terminated_length": 255.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.19000184467810366, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.026601277524605393, "learning_rate": 1.264904732636755e-05, "loss": 0.0011, "num_tokens": 8523878.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.19018631248847076, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.0740852365270257, "learning_rate": 1.2661339889366934e-05, "loss": 0.003, "num_tokens": 8531299.0, "reward": 2.9583334922790527, "reward_std": 0.11785107105970383, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 109.625, "completions/mean_terminated_length": 109.625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.19037078029883786, "frac_reward_zero_std": 0.0, "grad_norm": 3.4375, "kl": 0.03854954184498638, "learning_rate": 1.267363245236632e-05, "loss": 0.0015, "num_tokens": 8534960.0, "reward": 1.024999976158142, "reward_std": 0.539179265499115, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 128.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.19055524810920493, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.04990371782332659, "learning_rate": 1.2685925015365706e-05, "loss": 0.002, "num_tokens": 8538784.0, "reward": 2.0625, "reward_std": 0.1157275140285492, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1157275140285492, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.19073971591957203, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.020169257652014494, "learning_rate": 1.269821757836509e-05, "loss": 0.0008, "num_tokens": 8544869.0, "reward": 2.245833396911621, "reward_std": 0.23566016554832458, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23566018044948578, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.19092418372993913, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.04427777184173465, "learning_rate": 1.2710510141364477e-05, "loss": 0.0018, "num_tokens": 8553497.0, "reward": 2.769230842590332, "reward_std": 0.3359295427799225, "rewards/fixed_code_pass_all_test_reward/mean": 0.8942307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.29916056990623474, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2314550280570984, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 112.125, "completions/mean_terminated_length": 112.125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.1911086515403062, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.042717999895103276, "learning_rate": 1.272280270436386e-05, "loss": 0.0017, "num_tokens": 8557090.0, "reward": 2.0020833015441895, "reward_std": 0.5202592015266418, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25208336114883423, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23493288457393646, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 227.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.1912931193506733, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.04282675450667739, "learning_rate": 1.2735095267363247e-05, "loss": 0.0017, "num_tokens": 8565670.0, "reward": 1.5625, "reward_std": 0.3204349875450134, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.1914775871610404, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.04839248047210276, "learning_rate": 1.2747387830362631e-05, "loss": 0.0019, "num_tokens": 8574521.0, "reward": 1.681746006011963, "reward_std": 0.43989163637161255, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6817460060119629, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.43989166617393494, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 309.75, "completions/mean_terminated_length": 309.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.19166205497140748, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.01870358595624566, "learning_rate": 1.2759680393362017e-05, "loss": 0.0007, "num_tokens": 8581223.0, "reward": 1.3809840679168701, "reward_std": 0.18600594997406006, "rewards/fixed_code_pass_all_test_reward/mean": 0.28723403811454773, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18600596487522125, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.19184652278177458, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.08503155270591378, "learning_rate": 1.2771972956361403e-05, "loss": 0.0034, "num_tokens": 8590968.0, "reward": 2.4166667461395264, "reward_std": 0.40335899591445923, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.40335893630981445, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 203.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.19203099059214168, "frac_reward_zero_std": 1.0, "grad_norm": 0.046875, "kl": 0.03625491191633046, "learning_rate": 1.2784265519360787e-05, "loss": 0.0015, "num_tokens": 8600671.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 131.0, "completions/mean_terminated_length": 131.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.19221545840250875, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.031101613072678447, "learning_rate": 1.2796558082360173e-05, "loss": 0.0012, "num_tokens": 8604551.0, "reward": 2.727083444595337, "reward_std": 0.31571754813194275, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7270833253860474, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.31571757793426514, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.19239992621287585, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.04393464466556907, "learning_rate": 1.2808850645359558e-05, "loss": 0.0018, "num_tokens": 8611458.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 164.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.19258439402324296, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.049243778456002474, "learning_rate": 1.2821143208358944e-05, "loss": 0.002, "num_tokens": 8619294.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 361.375, "completions/mean_terminated_length": 361.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.19276886183361003, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.02298436057753861, "learning_rate": 1.283343577135833e-05, "loss": 0.0009, "num_tokens": 8626705.0, "reward": 1.7961957454681396, "reward_std": 0.3357647955417633, "rewards/fixed_code_pass_all_test_reward/mean": 0.7336956262588501, "rewards/fixed_code_pass_all_test_reward/std": 0.2613220810890198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.19295332964397713, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.023431531968526542, "learning_rate": 1.2845728334357714e-05, "loss": 0.0009, "num_tokens": 8632767.0, "reward": 1.533018946647644, "reward_std": 0.21683460474014282, "rewards/fixed_code_pass_all_test_reward/mean": 0.5330188870429993, "rewards/fixed_code_pass_all_test_reward/std": 0.21683460474014282, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 564.25, "completions/mean_terminated_length": 564.25, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 0.19313779745434423, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.023436830786522478, "learning_rate": 1.28580208973571e-05, "loss": 0.0009, "num_tokens": 8643617.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.1933222652647113, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.055624571396037936, "learning_rate": 1.2870313460356484e-05, "loss": 0.0022, "num_tokens": 8653251.0, "reward": 1.9895832538604736, "reward_std": 0.008625819347798824, "rewards/fixed_code_pass_all_test_reward/mean": 0.9895833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.008625810965895653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.1935067330750784, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.02143107366282493, "learning_rate": 1.288260602335587e-05, "loss": 0.0009, "num_tokens": 8659577.0, "reward": 1.743749976158142, "reward_std": 0.548659086227417, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11874999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13076014816761017, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 544.875, "completions/mean_terminated_length": 544.875, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.1936912008854455, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.020050220540724695, "learning_rate": 1.2894898586355256e-05, "loss": 0.0008, "num_tokens": 8669680.0, "reward": 1.3901580572128296, "reward_std": 0.34537604451179504, "rewards/fixed_code_pass_all_test_reward/mean": 0.26724135875701904, "rewards/fixed_code_pass_all_test_reward/std": 0.3270023465156555, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.12291666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13770727813243866, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 168.625, "completions/mean_terminated_length": 168.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.19387566869581258, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.068915456533432, "learning_rate": 1.290719114935464e-05, "loss": 0.0028, "num_tokens": 8677973.0, "reward": 2.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.19406013650617968, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.011694400338456035, "learning_rate": 1.2919483712354027e-05, "loss": 0.0005, "num_tokens": 8681891.0, "reward": 2.5520832538604736, "reward_std": 0.1355287730693817, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5520833730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1355288028717041, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 183.75, "completions/mean_terminated_length": 183.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.19424460431654678, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.029141453560441732, "learning_rate": 1.2931776275353411e-05, "loss": 0.0012, "num_tokens": 8690849.0, "reward": 2.012500047683716, "reward_std": 0.24164614081382751, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.949999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213478565216, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 106.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.19442907212691385, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.018146722111850977, "learning_rate": 1.2944068838352797e-05, "loss": 0.0007, "num_tokens": 8694735.0, "reward": 2.081249952316284, "reward_std": 0.3565083146095276, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20625001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0176776684820652, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.19461353993728095, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.02683996601263061, "learning_rate": 1.2956361401352183e-05, "loss": 0.0011, "num_tokens": 8698846.0, "reward": 1.9375, "reward_std": 0.3977864980697632, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12400397658348083, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 81.5, "completions/mean_terminated_length": 81.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.19479800774764802, "frac_reward_zero_std": 1.0, "grad_norm": 1.15625, "kl": 0.12187173892743886, "learning_rate": 1.2968653964351567e-05, "loss": 0.0049, "num_tokens": 8702154.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.19498247555801512, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.020637819194234908, "learning_rate": 1.2980946527350953e-05, "loss": 0.0008, "num_tokens": 8706386.0, "reward": 1.9500000476837158, "reward_std": 0.6824326515197754, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20000000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.37032803893089294, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 316.375, "completions/mean_terminated_length": 316.375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.19516694336838222, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.015579932020045817, "learning_rate": 1.2993239090350338e-05, "loss": 0.0006, "num_tokens": 8713477.0, "reward": 1.6643410921096802, "reward_std": 0.29524630308151245, "rewards/fixed_code_pass_all_test_reward/mean": 0.447674423456192, "rewards/fixed_code_pass_all_test_reward/std": 0.18088778853416443, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15092308819293976, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.1953514111787493, "frac_reward_zero_std": 1.0, "grad_norm": 0.05029296875, "kl": 0.026423858711495996, "learning_rate": 1.3005531653349725e-05, "loss": 0.0011, "num_tokens": 8718702.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 129.75, "completions/mean_terminated_length": 129.75, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.1955358789891164, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.028936480870470405, "learning_rate": 1.3017824216349111e-05, "loss": 0.0012, "num_tokens": 8722420.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 313.125, "completions/mean_terminated_length": 313.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.1957203467994835, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.026925568585284054, "learning_rate": 1.3030116779348496e-05, "loss": 0.0011, "num_tokens": 8729141.0, "reward": 1.3799424171447754, "reward_std": 0.14124147593975067, "rewards/fixed_code_pass_all_test_reward/mean": 0.28723403811454773, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09270833432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14124149084091187, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 377.125, "completions/mean_terminated_length": 377.125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.19590481460985057, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.055639876052737236, "learning_rate": 1.3042409342347882e-05, "loss": 0.0022, "num_tokens": 8737182.0, "reward": 1.091299057006836, "reward_std": 0.13551665842533112, "rewards/fixed_code_pass_all_test_reward/mean": 0.018382353708148003, "rewards/fixed_code_pass_all_test_reward/std": 0.051993150264024734, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.19608928242021767, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.02650094369892031, "learning_rate": 1.3054701905347268e-05, "loss": 0.0011, "num_tokens": 8747077.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.19627375023058477, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.014456832723226398, "learning_rate": 1.3066994468346652e-05, "loss": 0.0006, "num_tokens": 8751961.0, "reward": 2.21875, "reward_std": 0.6642478704452515, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.46875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23306745290756226, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 247.5, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.19645821804095184, "frac_reward_zero_std": 0.0, "grad_norm": 0.73828125, "kl": 0.027336691971868277, "learning_rate": 1.3079287031346038e-05, "loss": 0.0011, "num_tokens": 8764301.0, "reward": 1.9989224672317505, "reward_std": 0.08748288452625275, "rewards/fixed_code_pass_all_test_reward/mean": 0.03017241321504116, "rewards/fixed_code_pass_all_test_reward/std": 0.012191496789455414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.19664268585131894, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.05693602608516812, "learning_rate": 1.3091579594345422e-05, "loss": 0.0023, "num_tokens": 8768258.0, "reward": 1.6124999523162842, "reward_std": 0.4933485984802246, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.48750001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.43156692385673523, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 224.0, "completions/mean_terminated_length": 224.0, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.19682715366168604, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.04811902204528451, "learning_rate": 1.3103872157344808e-05, "loss": 0.0019, "num_tokens": 8778354.0, "reward": 2.450000047683716, "reward_std": 0.5424810647964478, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.949999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 123.125, "completions/mean_terminated_length": 123.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.19701162147205312, "frac_reward_zero_std": 1.0, "grad_norm": 0.1572265625, "kl": 0.0514000435359776, "learning_rate": 1.3116164720344192e-05, "loss": 0.0021, "num_tokens": 8787419.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 243.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.19719608928242022, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.05648013553582132, "learning_rate": 1.3128457283343578e-05, "loss": 0.0023, "num_tokens": 8795392.0, "reward": 2.268749952316284, "reward_std": 1.0619851350784302, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.643750011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4065864086151123, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 321.625, "completions/mean_terminated_length": 321.625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.19738055709278732, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.024752994650043547, "learning_rate": 1.3140749846342964e-05, "loss": 0.001, "num_tokens": 8802189.0, "reward": 1.8540890216827393, "reward_std": 0.1720172017812729, "rewards/fixed_code_pass_all_test_reward/mean": 0.7862318754196167, "rewards/fixed_code_pass_all_test_reward/std": 0.07173547148704529, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06785714626312256, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14321385324001312, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 210.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.1975650249031544, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.010919177380856127, "learning_rate": 1.3153042409342349e-05, "loss": 0.0004, "num_tokens": 8807373.0, "reward": 2.231250047683716, "reward_std": 0.06925775855779648, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.26249998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1452283412218094, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 309.5, "completions/mean_terminated_length": 309.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.1977494927135215, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.013536189449951053, "learning_rate": 1.3165334972341735e-05, "loss": 0.0005, "num_tokens": 8813561.0, "reward": 2.7708334922790527, "reward_std": 0.5265754461288452, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8958333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19795581698417664, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.1979339605238886, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.05151916155591607, "learning_rate": 1.3177627535341119e-05, "loss": 0.0021, "num_tokens": 8822880.0, "reward": 1.5348684787750244, "reward_std": 0.7334721088409424, "rewards/fixed_code_pass_all_test_reward/mean": 0.32236841320991516, "rewards/fixed_code_pass_all_test_reward/std": 0.3213021457195282, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3375000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23260942101478577, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.19811842833425566, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.02484806114807725, "learning_rate": 1.3189920098340505e-05, "loss": 0.001, "num_tokens": 8832006.0, "reward": 2.4461207389831543, "reward_std": 0.04461633041501045, "rewards/fixed_code_pass_all_test_reward/mean": 0.9461206793785095, "rewards/fixed_code_pass_all_test_reward/std": 0.04461631923913956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 264.625, "completions/mean_terminated_length": 264.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.19830289614462276, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.02321019978262484, "learning_rate": 1.3202212661339891e-05, "loss": 0.0009, "num_tokens": 8843091.0, "reward": 2.4847559928894043, "reward_std": 0.2791401743888855, "rewards/fixed_code_pass_all_test_reward/mean": 0.48475608229637146, "rewards/fixed_code_pass_all_test_reward/std": 0.27914005517959595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.19848736395498986, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.056351692881435156, "learning_rate": 1.3214505224339275e-05, "loss": 0.0023, "num_tokens": 8852878.0, "reward": 2.4375, "reward_std": 0.7288689613342285, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 114.125, "completions/mean_terminated_length": 114.125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.19867183176535694, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.029623490758240223, "learning_rate": 1.3226797787338661e-05, "loss": 0.0012, "num_tokens": 8856823.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.19885629957572404, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.042538136593066156, "learning_rate": 1.3239090350338046e-05, "loss": 0.0017, "num_tokens": 8862529.0, "reward": 1.3392857313156128, "reward_std": 0.4107697308063507, "rewards/fixed_code_pass_all_test_reward/mean": 0.3392857015132904, "rewards/fixed_code_pass_all_test_reward/std": 0.4107697308063507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 107.375, "completions/mean_terminated_length": 107.375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.19904076738609114, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.02793034992646426, "learning_rate": 1.3251382913337432e-05, "loss": 0.0011, "num_tokens": 8866164.0, "reward": 2.1875, "reward_std": 0.17060820758342743, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17060816287994385, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 175.625, "completions/mean_terminated_length": 175.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.1992252351964582, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.05746294092386961, "learning_rate": 1.3263675476336818e-05, "loss": 0.0023, "num_tokens": 8874785.0, "reward": 2.03125, "reward_std": 0.890801191329956, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.53125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5077524185180664, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 84.75, "completions/mean_terminated_length": 84.75, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.1994097030068253, "frac_reward_zero_std": 0.0, "grad_norm": 4.3125, "kl": 0.1758517725393176, "learning_rate": 1.3275968039336202e-05, "loss": 0.007, "num_tokens": 8882975.0, "reward": 2.5847458839416504, "reward_std": 0.3653206527233124, "rewards/fixed_code_pass_all_test_reward/mean": 0.5847457647323608, "rewards/fixed_code_pass_all_test_reward/std": 0.36532068252563477, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 306.0, "completions/mean_terminated_length": 306.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.1995941708171924, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.02085600991267711, "learning_rate": 1.3288260602335588e-05, "loss": 0.0008, "num_tokens": 8889751.0, "reward": 1.65625, "reward_std": 0.2680140733718872, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.24915467202663422, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.19977863862755948, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.07837783708237112, "learning_rate": 1.3300553165334972e-05, "loss": 0.0031, "num_tokens": 8897836.0, "reward": 2.2174417972564697, "reward_std": 0.13171322643756866, "rewards/fixed_code_pass_all_test_reward/mean": 0.26744186878204346, "rewards/fixed_code_pass_all_test_reward/std": 0.06921151280403137, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.949999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 214.0, "completions/mean_terminated_length": 214.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.19996310643792659, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.12986951507627964, "learning_rate": 1.3312845728334358e-05, "loss": 0.0052, "num_tokens": 8906324.0, "reward": 1.6812500953674316, "reward_std": 0.7831276059150696, "rewards/fixed_code_pass_all_test_reward/mean": 0.5812499523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.4644024670124054, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.22500000894069672, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11233454197645187, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.20014757424829369, "frac_reward_zero_std": 0.0, "grad_norm": 18.0, "kl": 0.028280748752877116, "learning_rate": 1.3325138291333744e-05, "loss": 0.0011, "num_tokens": 8913550.0, "reward": 1.6921769380569458, "reward_std": 0.7343316674232483, "rewards/fixed_code_pass_all_test_reward/mean": 0.5255101919174194, "rewards/fixed_code_pass_all_test_reward/std": 0.34179866313934326, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511381149292, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.20033204205866076, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.07501635048538446, "learning_rate": 1.3337430854333129e-05, "loss": 0.003, "num_tokens": 8921238.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 529.125, "completions/mean_terminated_length": 529.125, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.20051650986902786, "frac_reward_zero_std": 0.0, "grad_norm": 0.7265625, "kl": 0.02116880047833547, "learning_rate": 1.3349723417332515e-05, "loss": 0.0008, "num_tokens": 8935919.0, "reward": 2.422916889190674, "reward_std": 0.5874978303909302, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.48065248131752014, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7666666507720947, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3270842730998993, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 187.625, "completions/mean_terminated_length": 187.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.20070097767939493, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.03530742647126317, "learning_rate": 1.3362015980331899e-05, "loss": 0.0014, "num_tokens": 8943980.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 194.25, "completions/mean_terminated_length": 194.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.20088544548976203, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.03483016858808696, "learning_rate": 1.3374308543331285e-05, "loss": 0.0014, "num_tokens": 8951774.0, "reward": 1.4562499523162842, "reward_std": 0.19899298250675201, "rewards/fixed_code_pass_all_test_reward/mean": 0.45625001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.1989930272102356, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 443.875, "completions/mean_terminated_length": 443.875, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.20106991330012913, "frac_reward_zero_std": 0.0, "grad_norm": 0.77734375, "kl": 0.015524210873991251, "learning_rate": 1.338660110633067e-05, "loss": 0.0006, "num_tokens": 8966269.0, "reward": 2.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2012543811104962, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.07649559993296862, "learning_rate": 1.3398893669330055e-05, "loss": 0.0031, "num_tokens": 8972579.0, "reward": 2.933333396911621, "reward_std": 0.12848316133022308, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9333333373069763, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12848322093486786, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 109.5, "completions/mean_terminated_length": 109.5, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.2014388489208633, "frac_reward_zero_std": 0.0, "grad_norm": 4.15625, "kl": 0.039801574079319835, "learning_rate": 1.3411186232329443e-05, "loss": 0.0016, "num_tokens": 8976335.0, "reward": 2.1812500953674316, "reward_std": 0.3463456630706787, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.18125000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3463457524776459, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 139.875, "completions/mean_terminated_length": 139.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2016233167312304, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.020043171825818717, "learning_rate": 1.3423478795328829e-05, "loss": 0.0008, "num_tokens": 8980382.0, "reward": 1.28125, "reward_std": 0.3358757197856903, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1801537126302719, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.20180778454159748, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.07923999824561179, "learning_rate": 1.3435771358328213e-05, "loss": 0.0032, "num_tokens": 8988677.0, "reward": 2.7395832538604736, "reward_std": 0.37117770314216614, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7395833730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.371177613735199, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.20199225235196458, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.03036778053501621, "learning_rate": 1.34480639213276e-05, "loss": 0.0012, "num_tokens": 8992830.0, "reward": 2.5833334922790527, "reward_std": 1.0503212213516235, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8333333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3563483655452728, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 150.75, "completions/mean_terminated_length": 150.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.20217672016233168, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.02486209070775658, "learning_rate": 1.3460356484326984e-05, "loss": 0.001, "num_tokens": 9000660.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.20236118797269875, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.024514168268069625, "learning_rate": 1.347264904732637e-05, "loss": 0.001, "num_tokens": 9004718.0, "reward": 0.9416666626930237, "reward_std": 0.4006938338279724, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06666667014360428, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12848322093486786, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 407.125, "completions/mean_terminated_length": 407.125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.20254565578306585, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.038073220290243626, "learning_rate": 1.3484941610325754e-05, "loss": 0.0015, "num_tokens": 9015743.0, "reward": 2.5250000953674316, "reward_std": 0.5119988918304443, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8999999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2828427255153656, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.20273012359343295, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.03500075382180512, "learning_rate": 1.349723417332514e-05, "loss": 0.0014, "num_tokens": 9023913.0, "reward": 2.3333334922790527, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 82.875, "completions/mean_terminated_length": 82.875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.20291459140380003, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.05146819236688316, "learning_rate": 1.3509526736324526e-05, "loss": 0.0021, "num_tokens": 9027448.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.20309905921416713, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.02709308301564306, "learning_rate": 1.352181929932391e-05, "loss": 0.0011, "num_tokens": 9036527.0, "reward": 2.335970640182495, "reward_std": 0.040278226137161255, "rewards/fixed_code_pass_all_test_reward/mean": 0.6693037748336792, "rewards/fixed_code_pass_all_test_reward/std": 0.040278226137161255, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 89.25, "completions/mean_terminated_length": 89.25, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.20328352702453423, "frac_reward_zero_std": 1.0, "grad_norm": 0.240234375, "kl": 0.06274432060308754, "learning_rate": 1.3534111862323296e-05, "loss": 0.0025, "num_tokens": 9040233.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.2034679948349013, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.01419017935404554, "learning_rate": 1.354640442532268e-05, "loss": 0.0006, "num_tokens": 9045294.0, "reward": 1.631250023841858, "reward_std": 0.22028794884681702, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6312500238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2202879637479782, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2036524626452684, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.02011757146101445, "learning_rate": 1.3558696988322066e-05, "loss": 0.0008, "num_tokens": 9051836.0, "reward": 1.0543477535247803, "reward_std": 0.045004263520240784, "rewards/fixed_code_pass_all_test_reward/mean": 0.05434782803058624, "rewards/fixed_code_pass_all_test_reward/std": 0.04500427842140198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 157.125, "completions/mean_terminated_length": 157.125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.2038369304556355, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "kl": 0.04315241181757301, "learning_rate": 1.3570989551321452e-05, "loss": 0.0017, "num_tokens": 9056813.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.20402139826600257, "frac_reward_zero_std": 0.0, "grad_norm": 0.431640625, "kl": 0.030162883456796408, "learning_rate": 1.3583282114320837e-05, "loss": 0.0012, "num_tokens": 9066773.0, "reward": 2.066666603088379, "reward_std": 0.41096094250679016, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9416666626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1649915724992752, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.20420586607636967, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.12312188278883696, "learning_rate": 1.3595574677320223e-05, "loss": 0.0049, "num_tokens": 9074456.0, "reward": 2.408750057220459, "reward_std": 0.1476905643939972, "rewards/fixed_code_pass_all_test_reward/mean": 0.9399999380111694, "rewards/fixed_code_pass_all_test_reward/std": 0.08280788362026215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.46875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 314.375, "completions/mean_terminated_length": 314.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.20439033388673677, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.04543935926631093, "learning_rate": 1.3607867240319607e-05, "loss": 0.0018, "num_tokens": 9082819.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 386.125, "completions/mean_terminated_length": 386.125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.20457480169710385, "frac_reward_zero_std": 1.0, "grad_norm": 0.212890625, "kl": 0.030635859817266464, "learning_rate": 1.3620159803318993e-05, "loss": 0.0012, "num_tokens": 9090732.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 401.625, "completions/mean_terminated_length": 401.625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.20475926950747095, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.017237565654795617, "learning_rate": 1.3632452366318379e-05, "loss": 0.0007, "num_tokens": 9098625.0, "reward": 1.2352941036224365, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.23529411852359772, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 236.0, "completions/mean_terminated_length": 236.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.20494373731783805, "frac_reward_zero_std": 1.0, "grad_norm": 0.04052734375, "kl": 0.02228181599639356, "learning_rate": 1.3644744929317763e-05, "loss": 0.0009, "num_tokens": 9106393.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.20512820512820512, "frac_reward_zero_std": 0.0, "grad_norm": 0.640625, "kl": 0.017271390184760094, "learning_rate": 1.365703749231715e-05, "loss": 0.0007, "num_tokens": 9120371.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.20531267293857222, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.024453317222651094, "learning_rate": 1.3669330055316534e-05, "loss": 0.001, "num_tokens": 9125243.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 364.5, "completions/mean_terminated_length": 364.5, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.20549714074893932, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.05297210707794875, "learning_rate": 1.368162261831592e-05, "loss": 0.0021, "num_tokens": 9137983.0, "reward": 2.237499952316284, "reward_std": 0.44057589769363403, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.3854496479034424, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.2056816085593064, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.02400393330026418, "learning_rate": 1.3693915181315306e-05, "loss": 0.001, "num_tokens": 9142167.0, "reward": 1.1849747896194458, "reward_std": 0.11445502191781998, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1849747598171234, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11445504426956177, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 356.625, "completions/mean_terminated_length": 356.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.2058660763696735, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.02804691786877811, "learning_rate": 1.370620774431469e-05, "loss": 0.0011, "num_tokens": 9149948.0, "reward": 1.6015625, "reward_std": 0.4418628215789795, "rewards/fixed_code_pass_all_test_reward/mean": 0.6015625, "rewards/fixed_code_pass_all_test_reward/std": 0.4418628215789795, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 313.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.2060505441800406, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.017282933928072453, "learning_rate": 1.3718500307314076e-05, "loss": 0.0007, "num_tokens": 9157656.0, "reward": 1.3515625, "reward_std": 0.29112139344215393, "rewards/fixed_code_pass_all_test_reward/mean": 0.3515625, "rewards/fixed_code_pass_all_test_reward/std": 0.2911214232444763, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 242.75, "completions/mean_terminated_length": 242.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.20623501199040767, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.057335107238031924, "learning_rate": 1.373079287031346e-05, "loss": 0.0023, "num_tokens": 9167014.0, "reward": 2.554166793823242, "reward_std": 0.4876042902469635, "rewards/fixed_code_pass_all_test_reward/mean": 0.637499988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.5012484192848206, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022762298584, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.20641947980077477, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.057582609821110964, "learning_rate": 1.3743085433312846e-05, "loss": 0.0023, "num_tokens": 9176889.0, "reward": 1.4576923847198486, "reward_std": 1.0182361602783203, "rewards/fixed_code_pass_all_test_reward/mean": 0.3701923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.47593727707862854, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3375000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23260943591594696, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.20660394761114187, "frac_reward_zero_std": 0.0, "grad_norm": 3.71875, "kl": 0.04023144987877458, "learning_rate": 1.375537799631223e-05, "loss": 0.0016, "num_tokens": 9180715.0, "reward": 1.899999976158142, "reward_std": 0.37032803893089294, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.20678841542150894, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.007752051955321804, "learning_rate": 1.3767670559311617e-05, "loss": 0.0003, "num_tokens": 9187780.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 428.625, "completions/mean_terminated_length": 428.625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.20697288323187604, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.020542138256132603, "learning_rate": 1.3779963122311003e-05, "loss": 0.0008, "num_tokens": 9196201.0, "reward": 1.683333396911621, "reward_std": 0.6166988611221313, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.18333332240581512, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21675822138786316, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 326.25, "completions/mean_terminated_length": 326.25, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.20715735104224312, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.021661034203134477, "learning_rate": 1.3792255685310387e-05, "loss": 0.0009, "num_tokens": 9203323.0, "reward": 1.784752368927002, "reward_std": 0.47836169600486755, "rewards/fixed_code_pass_all_test_reward/mean": 0.5923912525177002, "rewards/fixed_code_pass_all_test_reward/std": 0.4378386437892914, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.19236111640930176, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1950204223394394, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.20734181885261022, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.059861536137759686, "learning_rate": 1.3804548248309775e-05, "loss": 0.0024, "num_tokens": 9211607.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.20752628666297732, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0530374696245417, "learning_rate": 1.381684081130916e-05, "loss": 0.0021, "num_tokens": 9216490.0, "reward": 1.8250000476837158, "reward_std": 0.38821935653686523, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08864052593708038, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 181.625, "completions/mean_terminated_length": 181.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.2077107544733444, "frac_reward_zero_std": 1.0, "grad_norm": 2.0625, "kl": 0.22712415549904108, "learning_rate": 1.3829133374308545e-05, "loss": 0.0091, "num_tokens": 9224191.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2078952222837115, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.018612428684718907, "learning_rate": 1.3841425937307931e-05, "loss": 0.0007, "num_tokens": 9229285.0, "reward": 2.4000000953674316, "reward_std": 0.2070196121931076, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4000000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20701968669891357, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 87.75, "completions/mean_terminated_length": 87.75, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.2080796900940786, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.2052718847990036, "learning_rate": 1.3853718500307315e-05, "loss": 0.0082, "num_tokens": 9237499.0, "reward": 2.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 102.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.20826415790444566, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.06088597699999809, "learning_rate": 1.3866011063306701e-05, "loss": 0.0024, "num_tokens": 9241803.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 67.375, "completions/mean_terminated_length": 67.375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.20844862571481276, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.058948634658008814, "learning_rate": 1.3878303626306087e-05, "loss": 0.0024, "num_tokens": 9245038.0, "reward": 1.1666667461395264, "reward_std": 0.35634833574295044, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.20863309352517986, "frac_reward_zero_std": 1.0, "grad_norm": 0.032470703125, "kl": 0.04209559434093535, "learning_rate": 1.3890596189305472e-05, "loss": 0.0017, "num_tokens": 9253979.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 758.375, "completions/mean_terminated_length": 758.375, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.20881756133554694, "frac_reward_zero_std": 0.0, "grad_norm": 0.75390625, "kl": 0.014210961409844458, "learning_rate": 1.3902888752304858e-05, "loss": 0.0006, "num_tokens": 9271094.0, "reward": 1.0714285373687744, "reward_std": 0.13225999474525452, "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, "rewards/fixed_code_pass_all_test_reward/std": 0.1322600245475769, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 108.0, "completions/mean_terminated_length": 108.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.20900202914591404, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.02432733995374292, "learning_rate": 1.3915181315304242e-05, "loss": 0.001, "num_tokens": 9274678.0, "reward": 1.524999976158142, "reward_std": 0.5119988918304443, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 227.625, "completions/mean_terminated_length": 227.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.20918649695628114, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.04328645416535437, "learning_rate": 1.3927473878303628e-05, "loss": 0.0017, "num_tokens": 9283299.0, "reward": 1.816666603088379, "reward_std": 0.24006974697113037, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2298911213874817, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 172.125, "completions/mean_terminated_length": 172.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.2093709647666482, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05760317808017135, "learning_rate": 1.3939766441303014e-05, "loss": 0.0023, "num_tokens": 9292204.0, "reward": 2.606250047683716, "reward_std": 0.31217843294143677, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.1602174937725067, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.824999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24348656833171844, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.2095554325770153, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.05212048953399062, "learning_rate": 1.3952059004302398e-05, "loss": 0.0021, "num_tokens": 9299434.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.2097399003873824, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.09409910487011075, "learning_rate": 1.3964351567301784e-05, "loss": 0.0038, "num_tokens": 9308211.0, "reward": 1.6111111640930176, "reward_std": 0.6588496565818787, "rewards/fixed_code_pass_all_test_reward/mean": 0.7361111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.3720859885215759, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.20992436819774948, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.0606194066349417, "learning_rate": 1.3976644130301168e-05, "loss": 0.0024, "num_tokens": 9317349.0, "reward": 1.9166667461395264, "reward_std": 0.3883216083049774, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7916666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39591163396835327, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 220.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.21010883600811658, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.022758017061278224, "learning_rate": 1.3988936693300554e-05, "loss": 0.0009, "num_tokens": 9322107.0, "reward": 2.075000047683716, "reward_std": 0.2121320217847824, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2121320515871048, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.21029330381848368, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.046210985630750656, "learning_rate": 1.400122925629994e-05, "loss": 0.0018, "num_tokens": 9328541.0, "reward": 2.825000047683716, "reward_std": 0.3412163555622101, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.887499988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3181980550289154, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.21047777162885076, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.09036805760115385, "learning_rate": 1.4013521819299325e-05, "loss": 0.0036, "num_tokens": 9335366.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.21066223943921786, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.09423051960766315, "learning_rate": 1.402581438229871e-05, "loss": 0.0038, "num_tokens": 9342116.0, "reward": 2.4166667461395264, "reward_std": 0.49601587653160095, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022762298584, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.21084670724958496, "frac_reward_zero_std": 1.0, "grad_norm": 2.375, "kl": 0.14315444836393, "learning_rate": 1.4038106945298095e-05, "loss": 0.0057, "num_tokens": 9351181.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.21103117505995203, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.02833326382096857, "learning_rate": 1.4050399508297481e-05, "loss": 0.0011, "num_tokens": 9358025.0, "reward": 1.3666666746139526, "reward_std": 0.39198315143585205, "rewards/fixed_code_pass_all_test_reward/mean": 0.36666667461395264, "rewards/fixed_code_pass_all_test_reward/std": 0.39198318123817444, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.21121564287031913, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.02343742491211742, "learning_rate": 1.4062692071296867e-05, "loss": 0.0009, "num_tokens": 9364680.0, "reward": 1.6064393520355225, "reward_std": 0.3792549669742584, "rewards/fixed_code_pass_all_test_reward/mean": 0.47727274894714355, "rewards/fixed_code_pass_all_test_reward/std": 0.455194354057312, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.12916666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18033258616924286, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 130.5, "completions/mean_terminated_length": 130.5, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.21140011068068623, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.07706406619399786, "learning_rate": 1.4074984634296251e-05, "loss": 0.0031, "num_tokens": 9370372.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 358.625, "completions/mean_terminated_length": 358.625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.2115845784910533, "frac_reward_zero_std": 0.0, "grad_norm": 0.734375, "kl": 0.014996366866398603, "learning_rate": 1.4087277197295637e-05, "loss": 0.0006, "num_tokens": 9377585.0, "reward": 2.377976179122925, "reward_std": 0.18006747961044312, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3779761791229248, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18006743490695953, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 403.0, "completions/mean_terminated_length": 403.0, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.2117690463014204, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.024990244302898645, "learning_rate": 1.4099569760295022e-05, "loss": 0.001, "num_tokens": 9389217.0, "reward": 1.3541666269302368, "reward_std": 0.440395712852478, "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.4124789535999298, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 183.875, "completions/mean_terminated_length": 183.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.2119535141117875, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.029750691959634423, "learning_rate": 1.4111862323294408e-05, "loss": 0.0012, "num_tokens": 9397968.0, "reward": 2.0208332538604736, "reward_std": 0.49149516224861145, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8958333134651184, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.294627845287323, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.21213798192215458, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.03636358003132045, "learning_rate": 1.4124154886293792e-05, "loss": 0.0015, "num_tokens": 9406573.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 615.125, "completions/mean_terminated_length": 410.4285888671875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.21232244973252168, "frac_reward_zero_std": 0.0, "grad_norm": 0.625, "kl": 0.015165937482379377, "learning_rate": 1.4136447449293178e-05, "loss": 0.0006, "num_tokens": 9418574.0, "reward": 1.4270832538604736, "reward_std": 0.6555114388465881, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.06305556744337082, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4895833432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36307084560394287, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 565.75, "completions/mean_terminated_length": 565.75, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.21250691754288878, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.015615351847372949, "learning_rate": 1.4148740012292564e-05, "loss": 0.0006, "num_tokens": 9429468.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 343.75, "completions/mean_terminated_length": 343.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.21269138535325585, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.025064941844902933, "learning_rate": 1.4161032575291948e-05, "loss": 0.001, "num_tokens": 9437986.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 281.625, "completions/mean_terminated_length": 281.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.21287585316362295, "frac_reward_zero_std": 0.0, "grad_norm": 4.9375, "kl": 0.33260808628983796, "learning_rate": 1.4173325138291334e-05, "loss": 0.0133, "num_tokens": 9444655.0, "reward": 1.8072917461395264, "reward_std": 0.5112512111663818, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0572916679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11980784684419632, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.21306032097399003, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.038573769852519035, "learning_rate": 1.4185617701290719e-05, "loss": 0.0015, "num_tokens": 9449081.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.21324478878435713, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.08000516309402883, "learning_rate": 1.4197910264290105e-05, "loss": 0.0032, "num_tokens": 9455925.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 323.875, "completions/mean_terminated_length": 323.875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.21342925659472423, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.0042074068551301025, "learning_rate": 1.4210202827289492e-05, "loss": 0.0002, "num_tokens": 9462508.0, "reward": 1.9861111640930176, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.9861111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 455.875, "completions/mean_terminated_length": 228.4285888671875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.2136137244050913, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.0597014541272074, "learning_rate": 1.4222495390288877e-05, "loss": 0.0024, "num_tokens": 9486939.0, "reward": 1.6785643100738525, "reward_std": 0.725792646408081, "rewards/fixed_code_pass_all_test_reward/mean": 0.599397599697113, "rewards/fixed_code_pass_all_test_reward/std": 0.32595396041870117, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20416668057441711, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15164443850517273, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 454.25, "completions/mean_terminated_length": 454.25, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.2137981922154584, "frac_reward_zero_std": 0.0, "grad_norm": 0.439453125, "kl": 0.02991547284182161, "learning_rate": 1.4234787953288263e-05, "loss": 0.0012, "num_tokens": 9498389.0, "reward": 1.9462963342666626, "reward_std": 0.16524378955364227, "rewards/fixed_code_pass_all_test_reward/mean": 0.6296296119689941, "rewards/fixed_code_pass_all_test_reward/std": 0.19797129929065704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3166666626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0471404567360878, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.2139826600258255, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.05742072348948568, "learning_rate": 1.4247080516287649e-05, "loss": 0.0023, "num_tokens": 9502626.0, "reward": 2.340625047683716, "reward_std": 0.15465256571769714, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.34062498807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15465255081653595, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 236.0, "completions/mean_terminated_length": 236.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.21416712783619257, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.04734270926564932, "learning_rate": 1.4259373079287033e-05, "loss": 0.0019, "num_tokens": 9511098.0, "reward": 1.8333333730697632, "reward_std": 0.23038095235824585, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.23038096725940704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 786.625, "completions/mean_terminated_length": 786.625, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.21435159564655967, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.021716597490012646, "learning_rate": 1.4271665642286419e-05, "loss": 0.0009, "num_tokens": 9528399.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 208.375, "completions/mean_terminated_length": 208.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.21453606345692677, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.0311652107629925, "learning_rate": 1.4283958205285803e-05, "loss": 0.0012, "num_tokens": 9532898.0, "reward": 1.9749999046325684, "reward_std": 0.40620189905166626, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10000000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10690450668334961, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 363.75, "completions/mean_terminated_length": 363.75, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.21472053126729385, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.03534514596685767, "learning_rate": 1.429625076828519e-05, "loss": 0.0014, "num_tokens": 9543856.0, "reward": 1.3227272033691406, "reward_std": 0.7134895324707031, "rewards/fixed_code_pass_all_test_reward/mean": 0.022727273404598236, "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.42500001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.47132036089897156, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.21490499907766095, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.019374533789232373, "learning_rate": 1.4308543331284575e-05, "loss": 0.0008, "num_tokens": 9548842.0, "reward": 2.049999952316284, "reward_std": 0.09258199483156204, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.21508946688802805, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.09402737719938159, "learning_rate": 1.432083589428396e-05, "loss": 0.0038, "num_tokens": 9556765.0, "reward": 2.0, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.21527393469839512, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.023585075279697776, "learning_rate": 1.4333128457283345e-05, "loss": 0.0009, "num_tokens": 9562447.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 237.25, "completions/mean_terminated_length": 237.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.21545840250876222, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.03626878187060356, "learning_rate": 1.434542102028273e-05, "loss": 0.0015, "num_tokens": 9571321.0, "reward": 1.8216032981872559, "reward_std": 0.30284926295280457, "rewards/fixed_code_pass_all_test_reward/mean": 0.41847825050354004, "rewards/fixed_code_pass_all_test_reward/std": 0.23232845962047577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.40312498807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1537841111421585, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 121.25, "completions/mean_terminated_length": 121.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.21564287031912932, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.08285041432827711, "learning_rate": 1.4357713583282116e-05, "loss": 0.0033, "num_tokens": 9579315.0, "reward": 2.196272134780884, "reward_std": 0.009304000996053219, "rewards/fixed_code_pass_all_test_reward/mean": 0.5296052694320679, "rewards/fixed_code_pass_all_test_reward/std": 0.009304022416472435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2158273381294964, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.032383181038312614, "learning_rate": 1.4370006146281502e-05, "loss": 0.0013, "num_tokens": 9583134.0, "reward": 2.5, "reward_std": 0.15118566155433655, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1511858105659485, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2160118059398635, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.05525679560378194, "learning_rate": 1.4382298709280886e-05, "loss": 0.0022, "num_tokens": 9591620.0, "reward": 2.1458334922790527, "reward_std": 0.325992614030838, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.32599255442619324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.2161962737502306, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.046169230714440346, "learning_rate": 1.4394591272280272e-05, "loss": 0.0018, "num_tokens": 9595247.0, "reward": 2.2083332538604736, "reward_std": 1.0991699695587158, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5833333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39319610595703125, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 322.5, "completions/mean_terminated_length": 322.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.21638074156059767, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.04332212673034519, "learning_rate": 1.4406883835279656e-05, "loss": 0.0017, "num_tokens": 9606387.0, "reward": 1.7870371341705322, "reward_std": 0.37731581926345825, "rewards/fixed_code_pass_all_test_reward/mean": 0.45370370149612427, "rewards/fixed_code_pass_all_test_reward/std": 0.37731584906578064, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.21656520937096477, "frac_reward_zero_std": 0.0, "grad_norm": 0.5390625, "kl": 0.02765336271841079, "learning_rate": 1.4419176398279042e-05, "loss": 0.0011, "num_tokens": 9613733.0, "reward": 2.213435173034668, "reward_std": 0.027677636593580246, "rewards/fixed_code_pass_all_test_reward/mean": 0.8801020383834839, "rewards/fixed_code_pass_all_test_reward/std": 0.027677593752741814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.21674967718133187, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.025900381500832736, "learning_rate": 1.4431468961278428e-05, "loss": 0.001, "num_tokens": 9618668.0, "reward": 2.75, "reward_std": 0.27774596214294434, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2777460217475891, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 314.625, "completions/mean_terminated_length": 314.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.21693414499169894, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.02101409458555281, "learning_rate": 1.4443761524277813e-05, "loss": 0.0008, "num_tokens": 9625577.0, "reward": 1.399999976158142, "reward_std": 0.6502746343612671, "rewards/fixed_code_pass_all_test_reward/mean": 0.4833333492279053, "rewards/fixed_code_pass_all_test_reward/std": 0.38089287281036377, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.21711861280206604, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.03856931871268898, "learning_rate": 1.4456054087277199e-05, "loss": 0.0015, "num_tokens": 9630103.0, "reward": 1.7755953073501587, "reward_std": 0.5662888884544373, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4005952477455139, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15298430621623993, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 413.375, "completions/mean_terminated_length": 413.375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.21730308061243314, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.01370009290985763, "learning_rate": 1.4468346650276583e-05, "loss": 0.0005, "num_tokens": 9637770.0, "reward": 1.0982322692871094, "reward_std": 0.4887063205242157, "rewards/fixed_code_pass_all_test_reward/mean": 0.04545454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.08416546881198883, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17777778208255768, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19876161217689514, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 333.0, "completions/mean_terminated_length": 333.0, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.21748754842280021, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.02840018819551915, "learning_rate": 1.4480639213275969e-05, "loss": 0.0011, "num_tokens": 9644938.0, "reward": 1.6982557773590088, "reward_std": 0.49580758810043335, "rewards/fixed_code_pass_all_test_reward/mean": 0.7732558250427246, "rewards/fixed_code_pass_all_test_reward/std": 0.2304723560810089, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.21767201623316731, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.050956722581759095, "learning_rate": 1.4492931776275353e-05, "loss": 0.002, "num_tokens": 9653140.0, "reward": 2.0, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 701.25, "completions/mean_terminated_length": 508.857177734375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.21785648404353442, "frac_reward_zero_std": 0.0, "grad_norm": 0.671875, "kl": 0.01766914257314056, "learning_rate": 1.450522433927474e-05, "loss": 0.0007, "num_tokens": 9664254.0, "reward": 1.493749976158142, "reward_std": 0.6997129321098328, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.39949744939804077, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 306.125, "completions/mean_terminated_length": 306.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.2180409518539015, "frac_reward_zero_std": 0.0, "grad_norm": 0.6328125, "kl": 0.03455129834765103, "learning_rate": 1.4517516902274125e-05, "loss": 0.0014, "num_tokens": 9670687.0, "reward": 1.9833333492279053, "reward_std": 0.5057604908943176, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17083333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07000567018985748, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 128.625, "completions/mean_terminated_length": 128.625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.2182254196642686, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.05726018315181136, "learning_rate": 1.452980946527351e-05, "loss": 0.0023, "num_tokens": 9674532.0, "reward": 1.9886903762817383, "reward_std": 0.4083350598812103, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11369048058986664, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09620049595832825, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 568.0, "completions/mean_terminated_length": 568.0, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.2184098874746357, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.01493296172702685, "learning_rate": 1.4542102028272896e-05, "loss": 0.0006, "num_tokens": 9684436.0, "reward": 1.3725961446762085, "reward_std": 0.30853894352912903, "rewards/fixed_code_pass_all_test_reward/mean": 0.3725961446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.30853894352912903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 208.25, "completions/mean_terminated_length": 208.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.21859435528500276, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.021993425325490534, "learning_rate": 1.455439459127228e-05, "loss": 0.0009, "num_tokens": 9689542.0, "reward": 1.0687499046325684, "reward_std": 0.5243618488311768, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.19375000894069672, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.307568222284317, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 324.375, "completions/mean_terminated_length": 324.375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.21877882309536986, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.038478161790408194, "learning_rate": 1.4566687154271666e-05, "loss": 0.0015, "num_tokens": 9698873.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 339.875, "completions/mean_terminated_length": 339.875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.21896329090573696, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.03475380386225879, "learning_rate": 1.4578979717271052e-05, "loss": 0.0014, "num_tokens": 9705872.0, "reward": 1.5718084573745728, "reward_std": 0.46244707703590393, "rewards/fixed_code_pass_all_test_reward/mean": 0.6968085169792175, "rewards/fixed_code_pass_all_test_reward/std": 0.41890740394592285, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 318.25, "completions/mean_terminated_length": 318.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.21914775871610404, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.024227620044257492, "learning_rate": 1.4591272280270436e-05, "loss": 0.001, "num_tokens": 9712786.0, "reward": 1.884920597076416, "reward_std": 0.21751579642295837, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428060531616, "rewards/fixed_code_pass_all_test_reward/std": 0.17908091843128204, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02777777798473835, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.07856741547584534, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 144.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.21933222652647114, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.029959874926134944, "learning_rate": 1.4603564843269824e-05, "loss": 0.0012, "num_tokens": 9716783.0, "reward": 2.820535659790039, "reward_std": 0.08506879210472107, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8205357193946838, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08506879210472107, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.2195166943368382, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.03967099648434669, "learning_rate": 1.461585740626921e-05, "loss": 0.0016, "num_tokens": 9720720.0, "reward": 1.9041666984558105, "reward_std": 0.14847101271152496, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9041666984558105, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14847104251384735, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.2197011621472053, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.031806957675144076, "learning_rate": 1.4628149969268594e-05, "loss": 0.0013, "num_tokens": 9725515.0, "reward": 2.8625001907348633, "reward_std": 0.176776722073555, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8624999523162842, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 427.375, "completions/mean_terminated_length": 427.375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.2198856299575724, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.026033747126348317, "learning_rate": 1.464044253226798e-05, "loss": 0.001, "num_tokens": 9734566.0, "reward": 1.2125000953674316, "reward_std": 0.13562026619911194, "rewards/fixed_code_pass_all_test_reward/mean": 0.21250000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.13562026619911194, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 395.625, "completions/mean_terminated_length": 395.625, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.22007009776793948, "frac_reward_zero_std": 0.0, "grad_norm": 0.50390625, "kl": 0.026275131735019386, "learning_rate": 1.4652735095267365e-05, "loss": 0.0011, "num_tokens": 9746195.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 258.75, "completions/mean_terminated_length": 258.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.22025456557830658, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.08775123907253146, "learning_rate": 1.466502765826675e-05, "loss": 0.0035, "num_tokens": 9752321.0, "reward": 1.334280252456665, "reward_std": 0.6853496432304382, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20928031206130981, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.34465089440345764, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 348.0, "completions/mean_terminated_length": 348.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.22043903338867368, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.11544510524254292, "learning_rate": 1.4677320221266137e-05, "loss": 0.0046, "num_tokens": 9763225.0, "reward": 2.011516571044922, "reward_std": 0.3774510622024536, "rewards/fixed_code_pass_all_test_reward/mean": 0.7228260636329651, "rewards/fixed_code_pass_all_test_reward/std": 0.29206582903862, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2886904776096344, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15081976354122162, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 194.25, "completions/mean_terminated_length": 194.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.22062350119904076, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.04595494514796883, "learning_rate": 1.468961278426552e-05, "loss": 0.0018, "num_tokens": 9767971.0, "reward": 1.8208333253860474, "reward_std": 0.5997188687324524, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.19583332538604736, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25846773386001587, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.22080796900940786, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.05031262570992112, "learning_rate": 1.4701905347264907e-05, "loss": 0.002, "num_tokens": 9771878.0, "reward": 1.774999976158142, "reward_std": 0.7206147909164429, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 412.875, "completions/mean_terminated_length": 412.875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.22099243681977496, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.025057791848666966, "learning_rate": 1.4714197910264291e-05, "loss": 0.001, "num_tokens": 9780901.0, "reward": 1.21875, "reward_std": 0.34196415543556213, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.34196415543556213, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.22117690463014203, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.03154367965180427, "learning_rate": 1.4726490473263677e-05, "loss": 0.0013, "num_tokens": 9784975.0, "reward": 2.4749999046325684, "reward_std": 0.5119988918304443, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4749999940395355, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5119988918304443, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 289.375, "completions/mean_terminated_length": 289.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.22136137244050913, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.02696843951707706, "learning_rate": 1.4738783036263063e-05, "loss": 0.0011, "num_tokens": 9794826.0, "reward": 2.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.22154584025087623, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.06470653554424644, "learning_rate": 1.4751075599262447e-05, "loss": 0.0026, "num_tokens": 9802636.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 225.625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.2217303080612433, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.05916286981664598, "learning_rate": 1.4763368162261833e-05, "loss": 0.0024, "num_tokens": 9808633.0, "reward": 1.4184523820877075, "reward_std": 0.5544322729110718, "rewards/fixed_code_pass_all_test_reward/mean": 0.2976190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.4377327859401703, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.12083333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14685864746570587, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 227.25, "completions/mean_terminated_length": 227.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.2219147758716104, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.048973285825923085, "learning_rate": 1.4775660725261218e-05, "loss": 0.002, "num_tokens": 9814683.0, "reward": 1.9249999523162842, "reward_std": 0.2121320217847824, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213627576828, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.2220992436819775, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.039879842195659876, "learning_rate": 1.4787953288260604e-05, "loss": 0.0016, "num_tokens": 9819719.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 563.75, "completions/mean_terminated_length": 563.75, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.22228371149234458, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.01790644822176546, "learning_rate": 1.480024585125999e-05, "loss": 0.0007, "num_tokens": 9830461.0, "reward": 1.2916667461395264, "reward_std": 0.1178511381149292, "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511381149292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 143.875, "completions/mean_terminated_length": 143.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.22246817930271168, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.022302513476461172, "learning_rate": 1.4812538414259374e-05, "loss": 0.0009, "num_tokens": 9834588.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.22265264711307878, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.04538067348767072, "learning_rate": 1.482483097725876e-05, "loss": 0.0018, "num_tokens": 9843860.0, "reward": 2.222916603088379, "reward_std": 0.4002913236618042, "rewards/fixed_code_pass_all_test_reward/mean": 0.6145833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.42941799759864807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6083333492279053, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1649916023015976, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.22283711492344585, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.07895399327389896, "learning_rate": 1.4837123540258144e-05, "loss": 0.0032, "num_tokens": 9849249.0, "reward": 2.375, "reward_std": 1.1877349615097046, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.22302158273381295, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.03711604664567858, "learning_rate": 1.484941610325753e-05, "loss": 0.0015, "num_tokens": 9860052.0, "reward": 1.9166667461395264, "reward_std": 0.5841830372810364, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7916666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39591163396835327, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.22320605054418005, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.06629605311900377, "learning_rate": 1.4861708666256915e-05, "loss": 0.0027, "num_tokens": 9863858.0, "reward": 2.4375, "reward_std": 1.0500850677490234, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.22339051835454712, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.08878782531246543, "learning_rate": 1.48740012292563e-05, "loss": 0.0036, "num_tokens": 9870465.0, "reward": 1.76630437374115, "reward_std": 0.32779058814048767, "rewards/fixed_code_pass_all_test_reward/mean": 0.8913043737411499, "rewards/fixed_code_pass_all_test_reward/std": 0.11620054394006729, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.22357498616491422, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.047053264221176505, "learning_rate": 1.4886293792255687e-05, "loss": 0.0019, "num_tokens": 9876239.0, "reward": 1.6528409719467163, "reward_std": 0.14778181910514832, "rewards/fixed_code_pass_all_test_reward/mean": 0.6278408765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.1154215857386589, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.22375945397528132, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.06682306167203933, "learning_rate": 1.4898586355255071e-05, "loss": 0.0027, "num_tokens": 9886483.0, "reward": 2.0452804565429688, "reward_std": 0.15895912051200867, "rewards/fixed_code_pass_all_test_reward/mean": 0.07653061300516129, "rewards/fixed_code_pass_all_test_reward/std": 0.10949453711509705, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 181.875, "completions/mean_terminated_length": 181.875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.2239439217856484, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.04350253217853606, "learning_rate": 1.4910878918254457e-05, "loss": 0.0017, "num_tokens": 9895282.0, "reward": 2.12890625, "reward_std": 0.27520158886909485, "rewards/fixed_code_pass_all_test_reward/mean": 0.31640625, "rewards/fixed_code_pass_all_test_reward/std": 0.2762135863304138, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1157275140285492, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 45.125, "completions/mean_terminated_length": 45.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.2241283895960155, "frac_reward_zero_std": 0.0, "grad_norm": 3.890625, "kl": 0.19459351245313883, "learning_rate": 1.4923171481253841e-05, "loss": 0.0078, "num_tokens": 9898347.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 66.625, "completions/mean_terminated_length": 66.625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.2243128574063826, "frac_reward_zero_std": 0.0, "grad_norm": 4.125, "kl": 0.14899244159460068, "learning_rate": 1.4935464044253227e-05, "loss": 0.006, "num_tokens": 9901744.0, "reward": 1.2916667461395264, "reward_std": 0.4520675837993622, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4520675837993622, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.22449732521674967, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.06884783180430532, "learning_rate": 1.4947756607252613e-05, "loss": 0.0028, "num_tokens": 9907352.0, "reward": 1.0535714626312256, "reward_std": 0.0739356130361557, "rewards/fixed_code_pass_all_test_reward/mean": 0.0535714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.0739356055855751, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 284.75, "completions/mean_terminated_length": 284.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.22468179302711677, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.025908364623319358, "learning_rate": 1.4960049170251998e-05, "loss": 0.001, "num_tokens": 9913838.0, "reward": 1.875, "reward_std": 0.14880472421646118, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.14880476891994476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 102.25, "completions/mean_terminated_length": 102.25, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.22486626083748387, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.0813440689817071, "learning_rate": 1.4972341733251384e-05, "loss": 0.0033, "num_tokens": 9917528.0, "reward": 2.6083335876464844, "reward_std": 0.3384135961532593, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6083333492279053, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.33841368556022644, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 162.5, "completions/mean_terminated_length": 162.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.22505072864785094, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.10561477928422391, "learning_rate": 1.4984634296250768e-05, "loss": 0.0042, "num_tokens": 9926492.0, "reward": 2.3333334922790527, "reward_std": 0.3563482463359833, "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3155997097492218, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17251639068126678, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 371.375, "completions/mean_terminated_length": 371.375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.22523519645821805, "frac_reward_zero_std": 0.0, "grad_norm": 0.48046875, "kl": 0.03262713167350739, "learning_rate": 1.4996926859250154e-05, "loss": 0.0013, "num_tokens": 9937279.0, "reward": 1.9242424964904785, "reward_std": 0.04285498708486557, "rewards/fixed_code_pass_all_test_reward/mean": 0.924242377281189, "rewards/fixed_code_pass_all_test_reward/std": 0.04285496100783348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.22541966426858512, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.06668349308893085, "learning_rate": 1.5009219422249542e-05, "loss": 0.0027, "num_tokens": 9941722.0, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 205.0, "completions/mean_terminated_length": 205.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.22560413207895222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.03966970951296389, "learning_rate": 1.5021511985248926e-05, "loss": 0.0016, "num_tokens": 9949746.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 416.375, "completions/mean_terminated_length": 416.375, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.22578859988931932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0400390625, "kl": 0.0312633600551635, "learning_rate": 1.5033804548248312e-05, "loss": 0.0013, "num_tokens": 9964189.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2259730676996864, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.0685603260062635, "learning_rate": 1.5046097111247698e-05, "loss": 0.0027, "num_tokens": 9968261.0, "reward": 2.0625, "reward_std": 0.5629958510398865, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 219.5, "completions/mean_terminated_length": 219.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.2261575355100535, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.038074787240475416, "learning_rate": 1.5058389674247082e-05, "loss": 0.0015, "num_tokens": 9974009.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.2263420033204206, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.0932351746596396, "learning_rate": 1.5070682237246468e-05, "loss": 0.0037, "num_tokens": 9983880.0, "reward": 2.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 355.375, "completions/mean_terminated_length": 355.375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.22652647113078767, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.04013463272713125, "learning_rate": 1.5082974800245852e-05, "loss": 0.0016, "num_tokens": 9991955.0, "reward": 1.6785714626312256, "reward_std": 0.2503642141819, "rewards/fixed_code_pass_all_test_reward/mean": 0.6785714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.25036418437957764, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.22671093894115477, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.13242831360548735, "learning_rate": 1.5095267363245238e-05, "loss": 0.0053, "num_tokens": 9999015.0, "reward": 1.8323862552642822, "reward_std": 0.19922278821468353, "rewards/fixed_code_pass_all_test_reward/mean": 0.8323863744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.19922277331352234, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.22689540675152187, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.10718176863156259, "learning_rate": 1.5107559926244624e-05, "loss": 0.0043, "num_tokens": 10009067.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 81.5, "completions/mean_terminated_length": 81.5, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.22707987456188894, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.12663219263777137, "learning_rate": 1.5119852489244009e-05, "loss": 0.0051, "num_tokens": 10012535.0, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 132.375, "completions/mean_terminated_length": 132.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.22726434237225604, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.1916047940030694, "learning_rate": 1.5132145052243395e-05, "loss": 0.0077, "num_tokens": 10020930.0, "reward": 2.629687547683716, "reward_std": 0.3278506398200989, "rewards/fixed_code_pass_all_test_reward/mean": 0.8671875, "rewards/fixed_code_pass_all_test_reward/std": 0.24592097103595734, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.762499988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.34511902928352356, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 410.125, "completions/mean_terminated_length": 176.1428680419922, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.22744881018262314, "frac_reward_zero_std": 0.0, "grad_norm": 0.70703125, "kl": 0.07522600125957979, "learning_rate": 1.5144437615242779e-05, "loss": 0.003, "num_tokens": 10028291.0, "reward": 1.0416667461395264, "reward_std": 0.5473601818084717, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1666666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35634833574295044, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.2276332779929902, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.08846958773210645, "learning_rate": 1.5156730178242165e-05, "loss": 0.0035, "num_tokens": 10034630.0, "reward": 1.75, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2278177458033573, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.0883655110374093, "learning_rate": 1.5169022741241551e-05, "loss": 0.0035, "num_tokens": 10042887.0, "reward": 1.9609375, "reward_std": 0.04175956919789314, "rewards/fixed_code_pass_all_test_reward/mean": 0.9609375, "rewards/fixed_code_pass_all_test_reward/std": 0.04175956919789314, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.2280022136137244, "frac_reward_zero_std": 0.0, "grad_norm": 0.73828125, "kl": 0.05675741331651807, "learning_rate": 1.5181315304240935e-05, "loss": 0.0023, "num_tokens": 10048347.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.2281866814240915, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.0982600748538971, "learning_rate": 1.5193607867240321e-05, "loss": 0.0039, "num_tokens": 10056673.0, "reward": 2.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 155.25, "completions/mean_terminated_length": 155.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2283711492344586, "frac_reward_zero_std": 1.0, "grad_norm": 0.2333984375, "kl": 0.10962901310995221, "learning_rate": 1.5205900430239706e-05, "loss": 0.0044, "num_tokens": 10062003.0, "reward": 1.0860215425491333, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.08602150529623032, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.2285556170448257, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.0729299746453762, "learning_rate": 1.5218192993239092e-05, "loss": 0.0029, "num_tokens": 10069778.0, "reward": 1.9249999523162842, "reward_std": 0.2121320217847824, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.925000011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2121320217847824, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 242.25, "completions/mean_terminated_length": 242.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.22874008485519276, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.06863168883137405, "learning_rate": 1.5230485556238476e-05, "loss": 0.0027, "num_tokens": 10079132.0, "reward": 1.6780303716659546, "reward_std": 0.7030834555625916, "rewards/fixed_code_pass_all_test_reward/mean": 0.8030303120613098, "rewards/fixed_code_pass_all_test_reward/std": 0.374017596244812, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.22892455266555986, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.03605798096396029, "learning_rate": 1.5242778119237862e-05, "loss": 0.0014, "num_tokens": 10086320.0, "reward": 1.8273236751556396, "reward_std": 0.3445238769054413, "rewards/fixed_code_pass_all_test_reward/mean": 0.759615421295166, "rewards/fixed_code_pass_all_test_reward/std": 0.3088918924331665, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0677083358168602, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09944663196802139, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 185.75, "completions/mean_terminated_length": 185.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.22910902047592696, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.11166064674034715, "learning_rate": 1.5255070682237248e-05, "loss": 0.0045, "num_tokens": 10094622.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 67.0, "completions/mean_terminated_length": 67.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.22929348828629403, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.0887457737699151, "learning_rate": 1.5267363245236632e-05, "loss": 0.0035, "num_tokens": 10097822.0, "reward": 2.125, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2314550280570984, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.22947795609666113, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.07078945497050881, "learning_rate": 1.527965580823602e-05, "loss": 0.0028, "num_tokens": 10103821.0, "reward": 2.027777671813965, "reward_std": 0.5203105807304382, "rewards/fixed_code_pass_all_test_reward/mean": 0.7777777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.2375655323266983, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 440.75, "completions/mean_terminated_length": 440.75, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.22966242390702823, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.040089664282277226, "learning_rate": 1.5291948371235404e-05, "loss": 0.0016, "num_tokens": 10113995.0, "reward": 1.5958333015441895, "reward_std": 0.5069164037704468, "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333283662796, "rewards/fixed_code_pass_all_test_reward/std": 0.3468394875526428, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.44999998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.49856939911842346, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 104.0, "completions/mean_terminated_length": 104.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.2298468917173953, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.24843186605721712, "learning_rate": 1.530424093423479e-05, "loss": 0.0099, "num_tokens": 10120603.0, "reward": 2.589583396911621, "reward_std": 0.46281898021698, "rewards/fixed_code_pass_all_test_reward/mean": 0.9333333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.18856181204319, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.65625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.48065248131752014, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 234.5, "completions/mean_terminated_length": 234.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.2300313595277624, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.10786373540759087, "learning_rate": 1.5316533497234173e-05, "loss": 0.0043, "num_tokens": 10131623.0, "reward": 2.05378794670105, "reward_std": 0.40154746174812317, "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, "rewards/fixed_code_pass_all_test_reward/std": 0.12856487929821014, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7583333253860474, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3786986768245697, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 88.375, "completions/mean_terminated_length": 88.375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.2302158273381295, "frac_reward_zero_std": 1.0, "grad_norm": 0.1923828125, "kl": 0.1578440871089697, "learning_rate": 1.532882606023356e-05, "loss": 0.0063, "num_tokens": 10135858.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 115.125, "completions/mean_terminated_length": 115.125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.23040029514849658, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.09147543925791979, "learning_rate": 1.5341118623232945e-05, "loss": 0.0037, "num_tokens": 10139843.0, "reward": 2.3125, "reward_std": 0.7989948987960815, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.23058476295886368, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.04256641608662903, "learning_rate": 1.535341118623233e-05, "loss": 0.0017, "num_tokens": 10144640.0, "reward": 2.84375, "reward_std": 0.22903135418891907, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2314550280570984, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.23076923076923078, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.0828013620339334, "learning_rate": 1.5365703749231714e-05, "loss": 0.0033, "num_tokens": 10150365.0, "reward": 1.2771738767623901, "reward_std": 0.3674124479293823, "rewards/fixed_code_pass_all_test_reward/mean": 0.15217390656471252, "rewards/fixed_code_pass_all_test_reward/std": 0.061487551778554916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 130.375, "completions/mean_terminated_length": 130.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.23095369857959785, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.14295411156490445, "learning_rate": 1.53779963122311e-05, "loss": 0.0057, "num_tokens": 10158472.0, "reward": 1.8184524774551392, "reward_std": 0.5705687403678894, "rewards/fixed_code_pass_all_test_reward/mean": 0.3392857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.4322873651981354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4791666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.46664541959762573, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.23113816638996496, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.0417405494954437, "learning_rate": 1.5390288875230486e-05, "loss": 0.0017, "num_tokens": 10164721.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 256.375, "completions/mean_terminated_length": 256.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.23132263420033206, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.06692950474098325, "learning_rate": 1.5402581438229875e-05, "loss": 0.0027, "num_tokens": 10174084.0, "reward": 1.6666667461395264, "reward_std": 0.471404492855072, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5416666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.501980185508728, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.23150710201069913, "frac_reward_zero_std": 1.0, "grad_norm": 0.4296875, "kl": 0.07846394460648298, "learning_rate": 1.5414874001229258e-05, "loss": 0.0031, "num_tokens": 10180009.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.23169156982106623, "frac_reward_zero_std": 1.0, "grad_norm": 0.2734375, "kl": 0.12466197554022074, "learning_rate": 1.5427166564228644e-05, "loss": 0.005, "num_tokens": 10186990.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.2318760376314333, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.11014496069401503, "learning_rate": 1.543945912722803e-05, "loss": 0.0044, "num_tokens": 10194275.0, "reward": 2.933333396911621, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9333333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 185.75, "completions/mean_terminated_length": 185.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.2320605054418004, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.0586754553951323, "learning_rate": 1.5451751690227416e-05, "loss": 0.0023, "num_tokens": 10202105.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2322449732521675, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.08780284319072962, "learning_rate": 1.5464044253226798e-05, "loss": 0.0035, "num_tokens": 10208948.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.23242944106253458, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.10038379486650229, "learning_rate": 1.5476336816226184e-05, "loss": 0.004, "num_tokens": 10215898.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 129.5, "completions/mean_terminated_length": 129.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.23261390887290168, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.10579917579889297, "learning_rate": 1.548862937922557e-05, "loss": 0.0042, "num_tokens": 10220926.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 130.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.23279837668326878, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.13992334669455886, "learning_rate": 1.5500921942224956e-05, "loss": 0.0056, "num_tokens": 10229182.0, "reward": 1.8611111640930176, "reward_std": 0.32934117317199707, "rewards/fixed_code_pass_all_test_reward/mean": 0.819444477558136, "rewards/fixed_code_pass_all_test_reward/std": 0.2781743109226227, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 95.625, "completions/mean_terminated_length": 95.625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.23298284449363585, "frac_reward_zero_std": 0.0, "grad_norm": 3.734375, "kl": 0.06334250466898084, "learning_rate": 1.5513214505224342e-05, "loss": 0.0025, "num_tokens": 10232699.0, "reward": 0.9375, "reward_std": 0.7763237953186035, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 484.375, "completions/mean_terminated_length": 484.375, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.23316731230400295, "frac_reward_zero_std": 0.0, "grad_norm": 0.30859375, "kl": 0.0313690104521811, "learning_rate": 1.5525507068223725e-05, "loss": 0.0013, "num_tokens": 10247262.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.23335178011437005, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.11627043411135674, "learning_rate": 1.553779963122311e-05, "loss": 0.0047, "num_tokens": 10252837.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 116.125, "completions/mean_terminated_length": 116.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.23353624792473712, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.12046443857252598, "learning_rate": 1.5550092194222497e-05, "loss": 0.0048, "num_tokens": 10258982.0, "reward": 2.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 278.75, "completions/mean_terminated_length": 278.75, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.23372071573510422, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.041497390950098634, "learning_rate": 1.5562384757221883e-05, "loss": 0.0017, "num_tokens": 10268852.0, "reward": 1.8465909957885742, "reward_std": 0.06296159327030182, "rewards/fixed_code_pass_all_test_reward/mean": 0.5340908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.032141219824552536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255690574646, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 89.25, "completions/mean_terminated_length": 89.25, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.23390518354547132, "frac_reward_zero_std": 1.0, "grad_norm": 0.158203125, "kl": 0.13518108986318111, "learning_rate": 1.557467732022127e-05, "loss": 0.0054, "num_tokens": 10274654.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 205.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.2340896513558384, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.14591327542439103, "learning_rate": 1.558696988322065e-05, "loss": 0.0058, "num_tokens": 10284816.0, "reward": 1.9779412746429443, "reward_std": 0.16109485924243927, "rewards/fixed_code_pass_all_test_reward/mean": 0.6966911554336548, "rewards/fixed_code_pass_all_test_reward/std": 0.15078012645244598, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.28125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11732383817434311, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 129.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2342741191662055, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.1341602262109518, "learning_rate": 1.5599262446220037e-05, "loss": 0.0054, "num_tokens": 10291815.0, "reward": 1.9892857074737549, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.2344585869765726, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.14178382162936032, "learning_rate": 1.5611555009219423e-05, "loss": 0.0057, "num_tokens": 10300866.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 182.0, "completions/mean_terminated_length": 182.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.23464305478693967, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.07825977960601449, "learning_rate": 1.562384757221881e-05, "loss": 0.0031, "num_tokens": 10306274.0, "reward": 1.5520832538604736, "reward_std": 0.16429364681243896, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.05143444612622261, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1354166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19889326393604279, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 257.625, "completions/mean_terminated_length": 257.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.23482752259730677, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.05248287180438638, "learning_rate": 1.5636140135218195e-05, "loss": 0.0021, "num_tokens": 10313007.0, "reward": 2.2604167461395264, "reward_std": 0.6688023805618286, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3854166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.46062737703323364, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 101.125, "completions/mean_terminated_length": 101.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.23501199040767387, "frac_reward_zero_std": 1.0, "grad_norm": 0.177734375, "kl": 0.1021359171718359, "learning_rate": 1.5648432698217578e-05, "loss": 0.0041, "num_tokens": 10319400.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 77.75, "completions/mean_terminated_length": 77.75, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.23519645821804094, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.17803855799138546, "learning_rate": 1.5660725261216964e-05, "loss": 0.0071, "num_tokens": 10325406.0, "reward": 2.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.23538092602840804, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.05068395333364606, "learning_rate": 1.567301782421635e-05, "loss": 0.002, "num_tokens": 10331801.0, "reward": 1.7999999523162842, "reward_std": 0.5126959681510925, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213627576828, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 296.125, "completions/mean_terminated_length": 296.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.23556539383877514, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.0508629463147372, "learning_rate": 1.5685310387215736e-05, "loss": 0.002, "num_tokens": 10342914.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 145.125, "completions/mean_terminated_length": 145.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.23574986164914222, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.05360544007271528, "learning_rate": 1.5697602950215122e-05, "loss": 0.0021, "num_tokens": 10350219.0, "reward": 2.59375, "reward_std": 0.376485139131546, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.3764851689338684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.23593432945950932, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.09428183967247605, "learning_rate": 1.5709895513214505e-05, "loss": 0.0038, "num_tokens": 10358582.0, "reward": 2.023648738861084, "reward_std": 0.23989297449588776, "rewards/fixed_code_pass_all_test_reward/mean": 0.27364861965179443, "rewards/fixed_code_pass_all_test_reward/std": 0.02255467139184475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22466906905174255, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 152.125, "completions/mean_terminated_length": 152.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.23611879726987642, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.0981861399486661, "learning_rate": 1.572218807621389e-05, "loss": 0.0039, "num_tokens": 10366239.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.2363032650802435, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.05891634663566947, "learning_rate": 1.5734480639213277e-05, "loss": 0.0024, "num_tokens": 10373692.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 101.375, "completions/mean_terminated_length": 101.375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.2364877328906106, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.2191311428323388, "learning_rate": 1.5746773202212663e-05, "loss": 0.0088, "num_tokens": 10377527.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 106.25, "completions/mean_terminated_length": 106.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.2366722007009777, "frac_reward_zero_std": 1.0, "grad_norm": 0.74609375, "kl": 0.10641193157061934, "learning_rate": 1.575906576521205e-05, "loss": 0.0043, "num_tokens": 10383809.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 133.875, "completions/mean_terminated_length": 133.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.23685666851134476, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.05931228678673506, "learning_rate": 1.577135832821143e-05, "loss": 0.0024, "num_tokens": 10390800.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.23704113632171186, "frac_reward_zero_std": 1.0, "grad_norm": 0.5, "kl": 0.13760091830044985, "learning_rate": 1.5783650891210817e-05, "loss": 0.0055, "num_tokens": 10393926.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 237.875, "completions/mean_terminated_length": 237.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.23722560413207897, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.05125854001380503, "learning_rate": 1.5795943454210203e-05, "loss": 0.0021, "num_tokens": 10400013.0, "reward": 1.3535715341567993, "reward_std": 0.40806373953819275, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.44086670875549316, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06785714626312256, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09529760479927063, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 210.625, "completions/mean_terminated_length": 210.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.23741007194244604, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.025476397946476936, "learning_rate": 1.580823601720959e-05, "loss": 0.001, "num_tokens": 10408994.0, "reward": 2.2083334922790527, "reward_std": 0.24800799787044525, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.24800795316696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 114.125, "completions/mean_terminated_length": 114.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.23759453975281314, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.1223515197634697, "learning_rate": 1.5820528580208975e-05, "loss": 0.0049, "num_tokens": 10416451.0, "reward": 2.2199997901916504, "reward_std": 0.5218647718429565, "rewards/fixed_code_pass_all_test_reward/mean": 0.2824999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.4480353891849518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 319.625, "completions/mean_terminated_length": 319.625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.2377790075631802, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.05826577916741371, "learning_rate": 1.583282114320836e-05, "loss": 0.0023, "num_tokens": 10424360.0, "reward": 1.4208333492279053, "reward_std": 0.6351945996284485, "rewards/fixed_code_pass_all_test_reward/mean": 0.5458333492279053, "rewards/fixed_code_pass_all_test_reward/std": 0.3500283658504486, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2379634753735473, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.09815360512584448, "learning_rate": 1.5845113706207747e-05, "loss": 0.0039, "num_tokens": 10432728.0, "reward": 1.875, "reward_std": 0.3547983467578888, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.3100321292877197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 184.25, "completions/mean_terminated_length": 184.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.2381479431839144, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.14706602413207293, "learning_rate": 1.5857406269207133e-05, "loss": 0.0059, "num_tokens": 10443370.0, "reward": 1.7172619104385376, "reward_std": 0.7047478556632996, "rewards/fixed_code_pass_all_test_reward/mean": 0.008928571827709675, "rewards/fixed_code_pass_all_test_reward/std": 0.025253813713788986, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8333333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3563483655452728, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 141.75, "completions/mean_terminated_length": 141.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.23833241099428149, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.0938509670086205, "learning_rate": 1.5869698832206516e-05, "loss": 0.0038, "num_tokens": 10451056.0, "reward": 2.258333206176758, "reward_std": 0.13540062308311462, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.035634830594062805, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 192.375, "completions/mean_terminated_length": 192.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.23851687880464859, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.05625411914661527, "learning_rate": 1.5881991395205902e-05, "loss": 0.0023, "num_tokens": 10460443.0, "reward": 2.15625, "reward_std": 0.5815235376358032, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.23870134661501569, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.09022435313090682, "learning_rate": 1.5894283958205288e-05, "loss": 0.0036, "num_tokens": 10465476.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 93.0, "completions/mean_terminated_length": 93.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.23888581442538276, "frac_reward_zero_std": 0.0, "grad_norm": 3.6875, "kl": 0.2329216431826353, "learning_rate": 1.5906576521204674e-05, "loss": 0.0093, "num_tokens": 10470692.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.23907028223574986, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.07896550628356636, "learning_rate": 1.591886908420406e-05, "loss": 0.0032, "num_tokens": 10481014.0, "reward": 1.8916666507720947, "reward_std": 0.13303540647029877, "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 107.25, "completions/mean_terminated_length": 107.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.23925475004611696, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.09577811742201447, "learning_rate": 1.5931161647203442e-05, "loss": 0.0038, "num_tokens": 10488968.0, "reward": 1.7864583730697632, "reward_std": 0.7218381762504578, "rewards/fixed_code_pass_all_test_reward/mean": 0.0364583358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.01473139226436615, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 65.5, "completions/mean_terminated_length": 65.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.23943921785648403, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796875, "kl": 0.1045219381339848, "learning_rate": 1.594345421020283e-05, "loss": 0.0042, "num_tokens": 10492380.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 153.375, "completions/mean_terminated_length": 153.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.23962368566685113, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.05728065874427557, "learning_rate": 1.5955746773202214e-05, "loss": 0.0023, "num_tokens": 10499087.0, "reward": 2.887500047683716, "reward_std": 0.21001702547073364, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 334.625, "completions/mean_terminated_length": 334.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.23980815347721823, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.07685505133122206, "learning_rate": 1.59680393362016e-05, "loss": 0.0031, "num_tokens": 10507388.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.2399926212875853, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.06989777320995927, "learning_rate": 1.5980331899200986e-05, "loss": 0.0028, "num_tokens": 10512632.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 133.5, "completions/mean_terminated_length": 133.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.2401770890979524, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0961107974871993, "learning_rate": 1.599262446220037e-05, "loss": 0.0038, "num_tokens": 10517716.0, "reward": 1.6193181276321411, "reward_std": 0.4413256049156189, "rewards/fixed_code_pass_all_test_reward/mean": 0.5568181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.3900700807571411, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 383.875, "completions/mean_terminated_length": 383.875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.2403615569083195, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.03923410503193736, "learning_rate": 1.6004917025199755e-05, "loss": 0.0016, "num_tokens": 10530355.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.24054602471868658, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.05682219658046961, "learning_rate": 1.601720958819914e-05, "loss": 0.0023, "num_tokens": 10540339.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.24073049252905368, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.09125853842124343, "learning_rate": 1.6029502151198527e-05, "loss": 0.0037, "num_tokens": 10551334.0, "reward": 2.547619104385376, "reward_std": 0.4419674873352051, "rewards/fixed_code_pass_all_test_reward/mean": 0.7767857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.1711910218000412, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7708333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3666396141052246, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.24091496033942078, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.06288368860259652, "learning_rate": 1.6041794714197913e-05, "loss": 0.0025, "num_tokens": 10560961.0, "reward": 2.3858695030212402, "reward_std": 0.2496960610151291, "rewards/fixed_code_pass_all_test_reward/mean": 0.510869562625885, "rewards/fixed_code_pass_all_test_reward/std": 0.12242486327886581, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17251639068126678, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 225.375, "completions/mean_terminated_length": 225.375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.24109942814978785, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.06507219979539514, "learning_rate": 1.6054087277197296e-05, "loss": 0.0026, "num_tokens": 10567732.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.24128389596015495, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.055536320665851235, "learning_rate": 1.606637984019668e-05, "loss": 0.0022, "num_tokens": 10577628.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.24146836377052205, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.07884830283001065, "learning_rate": 1.6078672403196068e-05, "loss": 0.0032, "num_tokens": 10585857.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 191.375, "completions/mean_terminated_length": 191.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.24165283158088913, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.06676299450919032, "learning_rate": 1.6090964966195454e-05, "loss": 0.0027, "num_tokens": 10591812.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 93.5, "completions/mean_terminated_length": 93.5, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.24183729939125623, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125, "kl": 0.12352058570832014, "learning_rate": 1.6103257529194836e-05, "loss": 0.0049, "num_tokens": 10595560.0, "reward": 1.5416667461395264, "reward_std": 0.501980185508728, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 111.125, "completions/mean_terminated_length": 111.125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.24202176720162333, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.15383195225149393, "learning_rate": 1.6115550092194222e-05, "loss": 0.0062, "num_tokens": 10601145.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 135.5, "completions/mean_terminated_length": 135.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2422062350119904, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.1067897523753345, "learning_rate": 1.6127842655193608e-05, "loss": 0.0043, "num_tokens": 10608701.0, "reward": 2.712890625, "reward_std": 0.38968077301979065, "rewards/fixed_code_pass_all_test_reward/mean": 0.806640625, "rewards/fixed_code_pass_all_test_reward/std": 0.3259320557117462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.2423907028223575, "frac_reward_zero_std": 1.0, "grad_norm": 0.044921875, "kl": 0.02259909827262163, "learning_rate": 1.6140135218192994e-05, "loss": 0.0009, "num_tokens": 10614028.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 59.875, "completions/mean_terminated_length": 59.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.2425751706327246, "frac_reward_zero_std": 0.0, "grad_norm": 4.25, "kl": 0.19194544525817037, "learning_rate": 1.615242778119238e-05, "loss": 0.0077, "num_tokens": 10617515.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 90.375, "completions/mean_terminated_length": 90.375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.24275963844309167, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.08471621805801988, "learning_rate": 1.6164720344191763e-05, "loss": 0.0034, "num_tokens": 10621158.0, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 87.625, "completions/mean_terminated_length": 87.625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.24294410625345877, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.050103817600756884, "learning_rate": 1.617701290719115e-05, "loss": 0.002, "num_tokens": 10624835.0, "reward": 2.8125, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25877460837364197, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 276.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.24312857406382588, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.08394070621579885, "learning_rate": 1.6189305470190535e-05, "loss": 0.0034, "num_tokens": 10635803.0, "reward": 2.427884578704834, "reward_std": 0.5710090398788452, "rewards/fixed_code_pass_all_test_reward/mean": 0.521634578704834, "rewards/fixed_code_pass_all_test_reward/std": 0.4350743293762207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 94.875, "completions/mean_terminated_length": 94.875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.24331304187419295, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.12365998607128859, "learning_rate": 1.620159803318992e-05, "loss": 0.0049, "num_tokens": 10643634.0, "reward": 2.390625, "reward_std": 0.5049051642417908, "rewards/fixed_code_pass_all_test_reward/mean": 0.390625, "rewards/fixed_code_pass_all_test_reward/std": 0.5049052834510803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 194.75, "completions/mean_terminated_length": 194.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.24349750968456005, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.1531977942213416, "learning_rate": 1.6213890596189307e-05, "loss": 0.0061, "num_tokens": 10651768.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 72.875, "completions/mean_terminated_length": 72.875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.24368197749492715, "frac_reward_zero_std": 0.0, "grad_norm": 3.859375, "kl": 0.10881978366523981, "learning_rate": 1.6226183159188693e-05, "loss": 0.0044, "num_tokens": 10655063.0, "reward": 2.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 74.0, "completions/mean_terminated_length": 74.0, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.24386644530529422, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.10943295061588287, "learning_rate": 1.623847572218808e-05, "loss": 0.0044, "num_tokens": 10658663.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 80.125, "completions/mean_terminated_length": 80.125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.24405091311566132, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "kl": 0.09284996660426259, "learning_rate": 1.6250768285187465e-05, "loss": 0.0037, "num_tokens": 10662136.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 79.75, "completions/mean_terminated_length": 79.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.2442353809260284, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.06926728272810578, "learning_rate": 1.6263060848186847e-05, "loss": 0.0028, "num_tokens": 10665734.0, "reward": 1.7916667461395264, "reward_std": 0.7332792282104492, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.49601590633392334, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.2444198487363955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.054769004695117474, "learning_rate": 1.6275353411186233e-05, "loss": 0.0022, "num_tokens": 10670523.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.2446043165467626, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.055091466289013624, "learning_rate": 1.628764597418562e-05, "loss": 0.0022, "num_tokens": 10679597.0, "reward": 1.3563218116760254, "reward_std": 0.5713199973106384, "rewards/fixed_code_pass_all_test_reward/mean": 0.6063218712806702, "rewards/fixed_code_pass_all_test_reward/std": 0.2726793885231018, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 360.0, "completions/mean_terminated_length": 360.0, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.24478878435712967, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.02588184899650514, "learning_rate": 1.6299938537185005e-05, "loss": 0.001, "num_tokens": 10690941.0, "reward": 2.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 229.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.24497325216749677, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.06559891416691244, "learning_rate": 1.631223110018439e-05, "loss": 0.0026, "num_tokens": 10696800.0, "reward": 1.7321428060531616, "reward_std": 0.3535534143447876, "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428060531616, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 124.25, "completions/mean_terminated_length": 124.25, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.24515771997786387, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.11367764789611101, "learning_rate": 1.6324523663183774e-05, "loss": 0.0045, "num_tokens": 10700746.0, "reward": 2.2083334922790527, "reward_std": 0.6651769280433655, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4714045524597168, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.24534218778823094, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.07895383192226291, "learning_rate": 1.633681622618316e-05, "loss": 0.0032, "num_tokens": 10707541.0, "reward": 1.552884578704834, "reward_std": 0.35006189346313477, "rewards/fixed_code_pass_all_test_reward/mean": 0.6778846383094788, "rewards/fixed_code_pass_all_test_reward/std": 0.04079463332891464, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.24552665559859804, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.08182010287418962, "learning_rate": 1.6349108789182546e-05, "loss": 0.0033, "num_tokens": 10715479.0, "reward": 1.7099359035491943, "reward_std": 0.8615520596504211, "rewards/fixed_code_pass_all_test_reward/mean": 0.41826921701431274, "rewards/fixed_code_pass_all_test_reward/std": 0.48408201336860657, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4520675837993622, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.24571112340896514, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.06967539805918932, "learning_rate": 1.6361401352181932e-05, "loss": 0.0028, "num_tokens": 10726367.0, "reward": 1.9627976417541504, "reward_std": 0.42655596137046814, "rewards/fixed_code_pass_all_test_reward/mean": 0.8377976417541504, "rewards/fixed_code_pass_all_test_reward/std": 0.10298775136470795, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 107.5, "completions/mean_terminated_length": 107.5, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.24589559121933222, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.11437387764453888, "learning_rate": 1.6373693915181318e-05, "loss": 0.0046, "num_tokens": 10732779.0, "reward": 2.6875, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 268.625, "completions/mean_terminated_length": 268.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.24608005902969932, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.05479431990534067, "learning_rate": 1.63859864781807e-05, "loss": 0.0022, "num_tokens": 10741432.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.24626452684006642, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.07762267719954252, "learning_rate": 1.6398279041180087e-05, "loss": 0.0031, "num_tokens": 10749607.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 65.75, "completions/mean_terminated_length": 65.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.2464489946504335, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.13166031055152416, "learning_rate": 1.6410571604179473e-05, "loss": 0.0053, "num_tokens": 10752877.0, "reward": 1.2395833730697632, "reward_std": 0.38688188791275024, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2395833432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.38688185811042786, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.2466334624608006, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.03632687125355005, "learning_rate": 1.642286416717886e-05, "loss": 0.0015, "num_tokens": 10758835.0, "reward": 1.5625, "reward_std": 0.7288689613342285, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4955156147480011, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 99.25, "completions/mean_terminated_length": 99.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.2468179302711677, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.08312083594501019, "learning_rate": 1.6435156730178245e-05, "loss": 0.0033, "num_tokens": 10762557.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 67.5, "completions/mean_terminated_length": 67.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.24700239808153476, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.10557534173130989, "learning_rate": 1.6447449293177627e-05, "loss": 0.0042, "num_tokens": 10765921.0, "reward": 2.8958334922790527, "reward_std": 0.19795580208301544, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8958333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19795581698417664, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.24718686589190186, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.07546161999925971, "learning_rate": 1.6459741856177013e-05, "loss": 0.003, "num_tokens": 10776187.0, "reward": 2.0027174949645996, "reward_std": 0.5505058169364929, "rewards/fixed_code_pass_all_test_reward/mean": 0.19021737575531006, "rewards/fixed_code_pass_all_test_reward/std": 0.32861340045928955, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 44.625, "completions/mean_terminated_length": 44.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.24737133370226896, "frac_reward_zero_std": 1.0, "grad_norm": 0.234375, "kl": 0.15805933717638254, "learning_rate": 1.64720344191764e-05, "loss": 0.0063, "num_tokens": 10779248.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 92.875, "completions/mean_terminated_length": 92.875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.24755580151263604, "frac_reward_zero_std": 1.0, "grad_norm": 0.2314453125, "kl": 0.11395868705585599, "learning_rate": 1.6484326982175785e-05, "loss": 0.0046, "num_tokens": 10782767.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.24774026932300314, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.05025406228378415, "learning_rate": 1.649661954517517e-05, "loss": 0.002, "num_tokens": 10790183.0, "reward": 1.9166667461395264, "reward_std": 0.6606874465942383, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5416666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.501980185508728, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.24792473713337024, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.12230105046182871, "learning_rate": 1.6508912108174554e-05, "loss": 0.0049, "num_tokens": 10798314.0, "reward": 2.3532607555389404, "reward_std": 0.47082728147506714, "rewards/fixed_code_pass_all_test_reward/mean": 0.47826087474823, "rewards/fixed_code_pass_all_test_reward/std": 0.185920849442482, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.2481092049437373, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.15308861015364528, "learning_rate": 1.652120467117394e-05, "loss": 0.0061, "num_tokens": 10803876.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.2482936727541044, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.10927761159837246, "learning_rate": 1.6533497234173326e-05, "loss": 0.0044, "num_tokens": 10812652.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2484781405644715, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.07254290953278542, "learning_rate": 1.6545789797172712e-05, "loss": 0.0029, "num_tokens": 10819365.0, "reward": 1.610465168952942, "reward_std": 0.07113827764987946, "rewards/fixed_code_pass_all_test_reward/mean": 0.6104651689529419, "rewards/fixed_code_pass_all_test_reward/std": 0.07113825529813766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.24866260837483858, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.13220136985182762, "learning_rate": 1.6558082360172098e-05, "loss": 0.0053, "num_tokens": 10828468.0, "reward": 2.81756329536438, "reward_std": 0.31439217925071716, "rewards/fixed_code_pass_all_test_reward/mean": 0.905063271522522, "rewards/fixed_code_pass_all_test_reward/std": 0.17578864097595215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9125000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 130.5, "completions/mean_terminated_length": 130.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.24884707618520568, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.1284906566143036, "learning_rate": 1.657037492317148e-05, "loss": 0.0051, "num_tokens": 10835064.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 127.125, "completions/mean_terminated_length": 127.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.24903154399557278, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.1684915255755186, "learning_rate": 1.6582667486170867e-05, "loss": 0.0067, "num_tokens": 10843105.0, "reward": 2.909090995788574, "reward_std": 0.2571297883987427, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.24921601180593986, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.16587179526686668, "learning_rate": 1.6594960049170253e-05, "loss": 0.0066, "num_tokens": 10846282.0, "reward": 2.6875, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 239.75, "completions/mean_terminated_length": 239.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.24940047961630696, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.1313049104064703, "learning_rate": 1.660725261216964e-05, "loss": 0.0053, "num_tokens": 10854176.0, "reward": 1.8333333730697632, "reward_std": 0.8357109427452087, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.3563483655452728, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 687.75, "completions/mean_terminated_length": 493.4285888671875, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.24958494742667406, "frac_reward_zero_std": 0.0, "grad_norm": 0.51953125, "kl": 0.035347149649169296, "learning_rate": 1.6619545175169025e-05, "loss": 0.0014, "num_tokens": 10872206.0, "reward": 1.547619104385376, "reward_std": 0.7167757153511047, "rewards/fixed_code_pass_all_test_reward/mean": 0.6726190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.4433853328227997, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 152.125, "completions/mean_terminated_length": 152.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.24976941523704113, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.13538733031600714, "learning_rate": 1.663183773816841e-05, "loss": 0.0054, "num_tokens": 10880023.0, "reward": 1.759615421295166, "reward_std": 0.46959546208381653, "rewards/fixed_code_pass_all_test_reward/mean": 0.009615384973585606, "rewards/fixed_code_pass_all_test_reward/std": 0.02719641663134098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 203.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.24995388304740823, "frac_reward_zero_std": 1.0, "grad_norm": 0.2294921875, "kl": 0.08189521823078394, "learning_rate": 1.6644130301167797e-05, "loss": 0.0033, "num_tokens": 10890606.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.2501383508577753, "frac_reward_zero_std": 1.0, "grad_norm": 0.3515625, "kl": 0.07156744226813316, "learning_rate": 1.6656422864167183e-05, "loss": 0.0029, "num_tokens": 10899124.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 323.625, "completions/mean_terminated_length": 323.625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.25032281866814243, "frac_reward_zero_std": 1.0, "grad_norm": 0.056884765625, "kl": 0.04858022090047598, "learning_rate": 1.6668715427166565e-05, "loss": 0.0019, "num_tokens": 10910889.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2505072864785095, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.08275308879092336, "learning_rate": 1.668100799016595e-05, "loss": 0.0033, "num_tokens": 10917478.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2506917542888766, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.08182646008208394, "learning_rate": 1.6693300553165337e-05, "loss": 0.0033, "num_tokens": 10922186.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.2508762220992437, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.11860853340476751, "learning_rate": 1.6705593116164723e-05, "loss": 0.0047, "num_tokens": 10930348.0, "reward": 2.644230842590332, "reward_std": 0.6964921951293945, "rewards/fixed_code_pass_all_test_reward/mean": 0.7692307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.4292752742767334, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.2510606899096108, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.08985923836007714, "learning_rate": 1.671788567916411e-05, "loss": 0.0036, "num_tokens": 10934388.0, "reward": 1.649999976158142, "reward_std": 0.6777647733688354, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15000000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24364949762821198, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 369.375, "completions/mean_terminated_length": 369.375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.25124515771997785, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.06613282253965735, "learning_rate": 1.6730178242163492e-05, "loss": 0.0026, "num_tokens": 10942775.0, "reward": 1.4749999046325684, "reward_std": 0.6584613919258118, "rewards/fixed_code_pass_all_test_reward/mean": 0.6000000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.37032803893089294, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.251429625530345, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0675462200306356, "learning_rate": 1.6742470805162878e-05, "loss": 0.0027, "num_tokens": 10951056.0, "reward": 2.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.25161409334071205, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.15534687135368586, "learning_rate": 1.6754763368162264e-05, "loss": 0.0062, "num_tokens": 10959391.0, "reward": 2.6182432174682617, "reward_std": 0.42154181003570557, "rewards/fixed_code_pass_all_test_reward/mean": 0.7432432174682617, "rewards/fixed_code_pass_all_test_reward/std": 0.3550444543361664, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 256.5, "completions/mean_terminated_length": 256.5, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2517985611510791, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.06787957577034831, "learning_rate": 1.676705593116165e-05, "loss": 0.0027, "num_tokens": 10964987.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.25198302896144625, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.08423251891508698, "learning_rate": 1.6779348494161036e-05, "loss": 0.0034, "num_tokens": 10973152.0, "reward": 2.6875, "reward_std": 0.31654128432273865, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.21906864643096924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.84375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24574755132198334, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 103.0, "completions/mean_terminated_length": 103.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.2521674967718133, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.1008045426569879, "learning_rate": 1.679164105716042e-05, "loss": 0.004, "num_tokens": 10976704.0, "reward": 2.179166555404663, "reward_std": 0.11910714954137802, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17916665971279144, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11910713464021683, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 375.0, "completions/mean_terminated_length": 375.0, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.2523519645821804, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.04579393193125725, "learning_rate": 1.6803933620159804e-05, "loss": 0.0018, "num_tokens": 10987056.0, "reward": 2.4342105388641357, "reward_std": 0.37375301122665405, "rewards/fixed_code_pass_all_test_reward/mean": 0.43421053886413574, "rewards/fixed_code_pass_all_test_reward/std": 0.37375304102897644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2525364323925475, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.04956584284082055, "learning_rate": 1.681622618315919e-05, "loss": 0.002, "num_tokens": 10991277.0, "reward": 1.5493056774139404, "reward_std": 0.1231364831328392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5493055582046509, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12313646823167801, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.2527209002029146, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.0755920191295445, "learning_rate": 1.6828518746158576e-05, "loss": 0.003, "num_tokens": 10996823.0, "reward": 1.8287036418914795, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.7037037014961243, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.2529053680132817, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.17201752867549658, "learning_rate": 1.684081130915796e-05, "loss": 0.0069, "num_tokens": 11005287.0, "reward": 2.5741524696350098, "reward_std": 0.4124341905117035, "rewards/fixed_code_pass_all_test_reward/mean": 0.5741525292396545, "rewards/fixed_code_pass_all_test_reward/std": 0.4124342203140259, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.25308983582364875, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.06273473193868995, "learning_rate": 1.6853103872157345e-05, "loss": 0.0025, "num_tokens": 11015023.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 210.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.2532743036340159, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.05027552065439522, "learning_rate": 1.686539643515673e-05, "loss": 0.002, "num_tokens": 11020442.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 81.0, "completions/mean_terminated_length": 81.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.25345877144438295, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.12039214884862304, "learning_rate": 1.6877688998156117e-05, "loss": 0.0048, "num_tokens": 11023914.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 152.0, "completions/mean_terminated_length": 152.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.25364323925475, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.05422212043777108, "learning_rate": 1.6889981561155503e-05, "loss": 0.0022, "num_tokens": 11028394.0, "reward": 2.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.25382770706511715, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.05865707085467875, "learning_rate": 1.6902274124154886e-05, "loss": 0.0023, "num_tokens": 11035476.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.2540121748754842, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.0483575772959739, "learning_rate": 1.691456668715427e-05, "loss": 0.0019, "num_tokens": 11039466.0, "reward": 2.1875, "reward_std": 0.2736801505088806, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.27368009090423584, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 535.375, "completions/mean_terminated_length": 535.375, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.2541966426858513, "frac_reward_zero_std": 0.0, "grad_norm": 0.5234375, "kl": 0.018780278624035418, "learning_rate": 1.6926859250153658e-05, "loss": 0.0008, "num_tokens": 11050389.0, "reward": 2.1613316535949707, "reward_std": 0.3558548390865326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9506173133850098, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21071428060531616, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3558548390865326, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.2543811104962184, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.06546738184988499, "learning_rate": 1.6939151813153044e-05, "loss": 0.0026, "num_tokens": 11054400.0, "reward": 1.8562500476837158, "reward_std": 0.7925715446472168, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.48124998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.44875821471214294, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.2545655783065855, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.056478331098333, "learning_rate": 1.695144437615243e-05, "loss": 0.0023, "num_tokens": 11061220.0, "reward": 2.016369104385376, "reward_std": 0.6060560345649719, "rewards/fixed_code_pass_all_test_reward/mean": 0.3080357313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.1936844438314438, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7083333134651184, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4249182939529419, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 349.875, "completions/mean_terminated_length": 349.875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.25475004611695257, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.05083748046308756, "learning_rate": 1.6963736939151812e-05, "loss": 0.002, "num_tokens": 11073419.0, "reward": 2.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.2549345139273197, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.06106687826104462, "learning_rate": 1.6976029502151198e-05, "loss": 0.0024, "num_tokens": 11078114.0, "reward": 2.450000047683716, "reward_std": 0.6584613919258118, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.574999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3798496127128601, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 164.125, "completions/mean_terminated_length": 164.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.25511898173768677, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.07553768111392856, "learning_rate": 1.6988322065150584e-05, "loss": 0.003, "num_tokens": 11083243.0, "reward": 1.6609195470809937, "reward_std": 0.3443346321582794, "rewards/fixed_code_pass_all_test_reward/mean": 0.45258623361587524, "rewards/fixed_code_pass_all_test_reward/std": 0.10972344875335693, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2083333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.25530344954805384, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0718183689750731, "learning_rate": 1.700061462814997e-05, "loss": 0.0029, "num_tokens": 11092623.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.25548791735842097, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.05398297915235162, "learning_rate": 1.7012907191149356e-05, "loss": 0.0022, "num_tokens": 11101597.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 87.125, "completions/mean_terminated_length": 87.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.25567238516878804, "frac_reward_zero_std": 1.0, "grad_norm": 0.5234375, "kl": 0.1999575849622488, "learning_rate": 1.7025199754148742e-05, "loss": 0.008, "num_tokens": 11110270.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 681.25, "completions/mean_terminated_length": 225.6666717529297, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.2558568529791551, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.04933039493334945, "learning_rate": 1.7037492317148128e-05, "loss": 0.002, "num_tokens": 11119312.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 226.375, "completions/mean_terminated_length": 226.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.25604132078952224, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.049146117526106536, "learning_rate": 1.7049784880147514e-05, "loss": 0.002, "num_tokens": 11125467.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 857.25, "completions/mean_terminated_length": 460.3333435058594, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.2562257885998893, "frac_reward_zero_std": 0.0, "grad_norm": 0.59765625, "kl": 0.04159418327617459, "learning_rate": 1.7062077443146897e-05, "loss": 0.0017, "num_tokens": 11138397.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.2564102564102564, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.07474727975204587, "learning_rate": 1.7074370006146283e-05, "loss": 0.003, "num_tokens": 11144658.0, "reward": 1.9122023582458496, "reward_std": 0.35055941343307495, "rewards/fixed_code_pass_all_test_reward/mean": 0.7142857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1979166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35055938363075256, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 185.25, "completions/mean_terminated_length": 185.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.2565947242206235, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.07740375678986311, "learning_rate": 1.708666256914567e-05, "loss": 0.0031, "num_tokens": 11150516.0, "reward": 1.7857142686843872, "reward_std": 0.23224091529846191, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.2322409451007843, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 78.75, "completions/mean_terminated_length": 78.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.2567791920309906, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.1062324708327651, "learning_rate": 1.7098955132145055e-05, "loss": 0.0042, "num_tokens": 11154034.0, "reward": 2.1666667461395264, "reward_std": 0.2519763708114624, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1666666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25197634100914, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.25696365984135766, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.06663242494687438, "learning_rate": 1.711124769514444e-05, "loss": 0.0027, "num_tokens": 11159668.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.2571481276517248, "frac_reward_zero_std": 1.0, "grad_norm": 0.1884765625, "kl": 0.07217319449409842, "learning_rate": 1.7123540258143823e-05, "loss": 0.0029, "num_tokens": 11166645.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.25733259546209186, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.08477358054369688, "learning_rate": 1.713583282114321e-05, "loss": 0.0034, "num_tokens": 11174793.0, "reward": 2.5892858505249023, "reward_std": 0.3874863386154175, "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.3874865174293518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 140.875, "completions/mean_terminated_length": 140.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.25751706327245893, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.09939438477158546, "learning_rate": 1.7148125384142595e-05, "loss": 0.004, "num_tokens": 11179840.0, "reward": 1.6602565050125122, "reward_std": 0.49601587653160095, "rewards/fixed_code_pass_all_test_reward/mean": 0.07692307978868484, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5833333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.49601587653160095, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 118.375, "completions/mean_terminated_length": 118.375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.25770153108282606, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.22641517128795385, "learning_rate": 1.716041794714198e-05, "loss": 0.0091, "num_tokens": 11186787.0, "reward": 1.9575892686843872, "reward_std": 0.038105517625808716, "rewards/fixed_code_pass_all_test_reward/mean": 0.9575892686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.03810553625226021, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.25788599889319314, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.06381628010421991, "learning_rate": 1.7172710510141367e-05, "loss": 0.0026, "num_tokens": 11190875.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.2580704667035602, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.06616952829062939, "learning_rate": 1.718500307314075e-05, "loss": 0.0026, "num_tokens": 11196124.0, "reward": 1.7777777910232544, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7777777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 105.625, "completions/mean_terminated_length": 105.625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.25825493451392734, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.057234080508351326, "learning_rate": 1.7197295636140136e-05, "loss": 0.0023, "num_tokens": 11199905.0, "reward": 2.25, "reward_std": 0.6546536684036255, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4432026445865631, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 306.5, "completions/mean_terminated_length": 306.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.2584394023242944, "frac_reward_zero_std": 0.0, "grad_norm": 0.6640625, "kl": 0.02644269692245871, "learning_rate": 1.7209588199139522e-05, "loss": 0.0011, "num_tokens": 11206741.0, "reward": 1.9810607433319092, "reward_std": 0.5435724258422852, "rewards/fixed_code_pass_all_test_reward/mean": 0.5227272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.2945791482925415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4583333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3646045923233032, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.2586238701346615, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.07416461128741503, "learning_rate": 1.7221880762138908e-05, "loss": 0.003, "num_tokens": 11216084.0, "reward": 1.91304349899292, "reward_std": 0.1523955762386322, "rewards/fixed_code_pass_all_test_reward/mean": 0.9130434989929199, "rewards/fixed_code_pass_all_test_reward/std": 0.1523955911397934, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 96.0, "completions/mean_terminated_length": 96.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2588083379450286, "frac_reward_zero_std": 0.0, "grad_norm": 7.34375, "kl": 0.15847757551819086, "learning_rate": 1.7234173325138294e-05, "loss": 0.0063, "num_tokens": 11223892.0, "reward": 1.9305555820465088, "reward_std": 0.7990293502807617, "rewards/fixed_code_pass_all_test_reward/mean": 0.1805555522441864, "rewards/fixed_code_pass_all_test_reward/std": 0.18781206011772156, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 187.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.2589928057553957, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.09141491958871484, "learning_rate": 1.7246465888137677e-05, "loss": 0.0037, "num_tokens": 11232348.0, "reward": 2.5, "reward_std": 1.0690449476242065, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 450.625, "completions/mean_terminated_length": 450.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.25917727356576276, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.012602392205735669, "learning_rate": 1.7258758451137063e-05, "loss": 0.0005, "num_tokens": 11241025.0, "reward": 2.493990421295166, "reward_std": 0.07083339989185333, "rewards/fixed_code_pass_all_test_reward/mean": 0.884615421295166, "rewards/fixed_code_pass_all_test_reward/std": 0.041117113083601, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.609375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.04419417306780815, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 83.375, "completions/mean_terminated_length": 83.375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2593617413761299, "frac_reward_zero_std": 1.0, "grad_norm": 0.1806640625, "kl": 0.10982017777860165, "learning_rate": 1.727105101413645e-05, "loss": 0.0044, "num_tokens": 11244444.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.25954620918649696, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.057777900248765945, "learning_rate": 1.7283343577135835e-05, "loss": 0.0023, "num_tokens": 11253465.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.25973067699686403, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.05025557987391949, "learning_rate": 1.729563614013522e-05, "loss": 0.002, "num_tokens": 11257611.0, "reward": 2.049999952316284, "reward_std": 0.5424810647964478, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17500001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.345377653837204, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 247.375, "completions/mean_terminated_length": 247.375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.25991514480723116, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.05537971993908286, "learning_rate": 1.7307928703134603e-05, "loss": 0.0022, "num_tokens": 11266630.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 142.0, "completions/mean_terminated_length": 142.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.26009961261759823, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.058540979865938425, "learning_rate": 1.732022126613399e-05, "loss": 0.0023, "num_tokens": 11270622.0, "reward": 2.9166667461395264, "reward_std": 0.15430328249931335, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15430334210395813, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 284.5, "completions/mean_terminated_length": 284.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.2602840804279653, "frac_reward_zero_std": 0.0, "grad_norm": 0.5859375, "kl": 0.008400811930187047, "learning_rate": 1.7332513829133375e-05, "loss": 0.0003, "num_tokens": 11276882.0, "reward": 2.1750001907348633, "reward_std": 0.07071070373058319, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17500001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 170.125, "completions/mean_terminated_length": 170.125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.26046854823833243, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.04452551936265081, "learning_rate": 1.734480639213276e-05, "loss": 0.0018, "num_tokens": 11281603.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.2606530160486995, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.07216860249172896, "learning_rate": 1.7357098955132147e-05, "loss": 0.0029, "num_tokens": 11285579.0, "reward": 1.540624976158142, "reward_std": 0.9537461400032043, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.16562500596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.34561994671821594, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2608374838590666, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.09506922215223312, "learning_rate": 1.736939151813153e-05, "loss": 0.0038, "num_tokens": 11293706.0, "reward": 2.7272725105285645, "reward_std": 0.376399427652359, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.3763993978500366, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.2610219516694337, "frac_reward_zero_std": 1.0, "grad_norm": 0.18359375, "kl": 0.08957654424011707, "learning_rate": 1.7381684081130916e-05, "loss": 0.0036, "num_tokens": 11299618.0, "reward": 1.8095238208770752, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8095238208770752, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 141.625, "completions/mean_terminated_length": 141.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.2612064194798008, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.09594391379505396, "learning_rate": 1.7393976644130302e-05, "loss": 0.0038, "num_tokens": 11307615.0, "reward": 2.375, "reward_std": 1.0606601238250732, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 340.625, "completions/mean_terminated_length": 340.625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.26139088729016785, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.0529429716989398, "learning_rate": 1.7406269207129688e-05, "loss": 0.0021, "num_tokens": 11315588.0, "reward": 1.557692289352417, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.3076923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 264.625, "completions/mean_terminated_length": 264.625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.261575355100535, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.024304215097799897, "learning_rate": 1.7418561770129074e-05, "loss": 0.001, "num_tokens": 11321777.0, "reward": 2.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 142.75, "completions/mean_terminated_length": 142.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.26175982291090205, "frac_reward_zero_std": 1.0, "grad_norm": 0.244140625, "kl": 0.10675794305279851, "learning_rate": 1.743085433312846e-05, "loss": 0.0043, "num_tokens": 11326759.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.2619442907212691, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.0645335135050118, "learning_rate": 1.7443146896127846e-05, "loss": 0.0026, "num_tokens": 11335866.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 111.75, "completions/mean_terminated_length": 111.75, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.26212875853163625, "frac_reward_zero_std": 1.0, "grad_norm": 0.34375, "kl": 0.14564611203968525, "learning_rate": 1.7455439459127232e-05, "loss": 0.0058, "num_tokens": 11342816.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.2623132263420033, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.05542821204289794, "learning_rate": 1.7467732022126614e-05, "loss": 0.0022, "num_tokens": 11346801.0, "reward": 1.8520833253860474, "reward_std": 0.5444844365119934, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.35208332538604736, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3291930556297302, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.2624976941523704, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.08585123904049397, "learning_rate": 1.7480024585126e-05, "loss": 0.0034, "num_tokens": 11350513.0, "reward": 2.8645834922790527, "reward_std": 0.1988932490348816, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8645833730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19889327883720398, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 163.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.2626821619627375, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.08700496330857277, "learning_rate": 1.7492317148125386e-05, "loss": 0.0035, "num_tokens": 11360089.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.2628666297731046, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.14127080794423819, "learning_rate": 1.7504609711124772e-05, "loss": 0.0057, "num_tokens": 11365423.0, "reward": 1.7028303146362305, "reward_std": 0.25213322043418884, "rewards/fixed_code_pass_all_test_reward/mean": 0.7028301954269409, "rewards/fixed_code_pass_all_test_reward/std": 0.25213325023651123, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 277.625, "completions/mean_terminated_length": 277.625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.26305109758347167, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.009281243837904185, "learning_rate": 1.751690227412416e-05, "loss": 0.0004, "num_tokens": 11371444.0, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 176.875, "completions/mean_terminated_length": 176.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.2632355653938388, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.08914761384949088, "learning_rate": 1.752919483712354e-05, "loss": 0.0036, "num_tokens": 11379427.0, "reward": 2.363888740539551, "reward_std": 0.24608032405376434, "rewards/fixed_code_pass_all_test_reward/mean": 0.9013888835906982, "rewards/fixed_code_pass_all_test_reward/std": 0.05892555043101311, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4625000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23260943591594696, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 113.875, "completions/mean_terminated_length": 113.875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.2634200332042059, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.15443378500640392, "learning_rate": 1.7541487400122927e-05, "loss": 0.0062, "num_tokens": 11387162.0, "reward": 2.8970589637756348, "reward_std": 0.29116159677505493, "rewards/fixed_code_pass_all_test_reward/mean": 0.8970588445663452, "rewards/fixed_code_pass_all_test_reward/std": 0.2911616265773773, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.26360450101457295, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.06624418776482344, "learning_rate": 1.7553779963122313e-05, "loss": 0.0027, "num_tokens": 11393894.0, "reward": 1.6800596714019775, "reward_std": 0.25064197182655334, "rewards/fixed_code_pass_all_test_reward/mean": 0.6071429252624512, "rewards/fixed_code_pass_all_test_reward/std": 0.2099909782409668, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 114.75, "completions/mean_terminated_length": 114.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2637889688249401, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.1351399663835764, "learning_rate": 1.75660725261217e-05, "loss": 0.0054, "num_tokens": 11401604.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.26397343663530715, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.12897243443876505, "learning_rate": 1.757836508912108e-05, "loss": 0.0052, "num_tokens": 11407604.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.2641579044456742, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.07921831589192152, "learning_rate": 1.7590657652120468e-05, "loss": 0.0032, "num_tokens": 11418203.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.26434237225604135, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.12223133677616715, "learning_rate": 1.7602950215119854e-05, "loss": 0.0049, "num_tokens": 11423886.0, "reward": 1.2622158527374268, "reward_std": 0.3202367126941681, "rewards/fixed_code_pass_all_test_reward/mean": 0.034090910106897354, "rewards/fixed_code_pass_all_test_reward/std": 0.047049928456544876, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.22812500596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.34367695450782776, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2645268400664084, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.0659632463939488, "learning_rate": 1.761524277811924e-05, "loss": 0.0026, "num_tokens": 11430302.0, "reward": 1.4946236610412598, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.49462366104125977, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 261.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.2647113078767755, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.06426241435110569, "learning_rate": 1.7627535341118626e-05, "loss": 0.0026, "num_tokens": 11437204.0, "reward": 1.421875, "reward_std": 0.7194290161132812, "rewards/fixed_code_pass_all_test_reward/mean": 0.296875, "rewards/fixed_code_pass_all_test_reward/std": 0.4378187656402588, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 141.875, "completions/mean_terminated_length": 141.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.2648957756871426, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.11386467609554529, "learning_rate": 1.7639827904118008e-05, "loss": 0.0046, "num_tokens": 11445211.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.2650802434975097, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.06811228534206748, "learning_rate": 1.7652120467117394e-05, "loss": 0.0027, "num_tokens": 11450958.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 416.875, "completions/mean_terminated_length": 416.875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.26526471130787677, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.05079311411827803, "learning_rate": 1.766441303011678e-05, "loss": 0.002, "num_tokens": 11459069.0, "reward": 1.8125, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 411.875, "completions/mean_terminated_length": 411.875, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.26544917911824384, "frac_reward_zero_std": 0.0, "grad_norm": 0.73046875, "kl": 0.055898024467751384, "learning_rate": 1.7676705593116166e-05, "loss": 0.0022, "num_tokens": 11471740.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 217.375, "completions/mean_terminated_length": 217.375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.26563364692861097, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.12303999764844775, "learning_rate": 1.7688998156115552e-05, "loss": 0.0049, "num_tokens": 11477719.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.26581811473897804, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.01614483119919896, "learning_rate": 1.7701290719114935e-05, "loss": 0.0006, "num_tokens": 11483296.0, "reward": 2.1118597984313965, "reward_std": 0.060521356761455536, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11185966432094574, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.06052136421203613, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 276.875, "completions/mean_terminated_length": 276.875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.2660025825493451, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.057489358354359865, "learning_rate": 1.771358328211432e-05, "loss": 0.0023, "num_tokens": 11490335.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.26618705035971224, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.023353609954938293, "learning_rate": 1.7725875845113707e-05, "loss": 0.0009, "num_tokens": 11494897.0, "reward": 2.4375, "reward_std": 0.5062113404273987, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2787087559700012, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.2663715181700793, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.050131677533499897, "learning_rate": 1.7738168408113093e-05, "loss": 0.002, "num_tokens": 11499236.0, "reward": 2.227083206176758, "reward_std": 0.5495985150337219, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.35208332538604736, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2765199840068817, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 116.25, "completions/mean_terminated_length": 116.25, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.2665559859804464, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.1154615324921906, "learning_rate": 1.775046097111248e-05, "loss": 0.0046, "num_tokens": 11503822.0, "reward": 1.6570076942443848, "reward_std": 0.3347059190273285, "rewards/fixed_code_pass_all_test_reward/mean": 0.40909093618392944, "rewards/fixed_code_pass_all_test_reward/std": 0.048592954874038696, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24791666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3468037247657776, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 117.375, "completions/mean_terminated_length": 117.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.2667404537908135, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.0618455451913178, "learning_rate": 1.776275353411186e-05, "loss": 0.0025, "num_tokens": 11510065.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 295.25, "completions/mean_terminated_length": 295.25, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.2669249216011806, "frac_reward_zero_std": 0.0, "grad_norm": 0.71484375, "kl": 0.030481564812362194, "learning_rate": 1.7775046097111248e-05, "loss": 0.0012, "num_tokens": 11516379.0, "reward": 2.5625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.26710938941154766, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.08304304955527186, "learning_rate": 1.7787338660110634e-05, "loss": 0.0033, "num_tokens": 11524475.0, "reward": 1.5855262279510498, "reward_std": 0.16747263073921204, "rewards/fixed_code_pass_all_test_reward/mean": 0.5855263471603394, "rewards/fixed_code_pass_all_test_reward/std": 0.16747266054153442, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 144.5, "completions/mean_terminated_length": 144.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.2672938572219148, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.19294750783592463, "learning_rate": 1.779963122311002e-05, "loss": 0.0077, "num_tokens": 11531599.0, "reward": 1.708482265472412, "reward_std": 0.0877508595585823, "rewards/fixed_code_pass_all_test_reward/mean": 0.6584821343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.11471439152956009, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 488.25, "completions/mean_terminated_length": 488.25, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.26747832503228186, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.04027494980255142, "learning_rate": 1.7811923786109406e-05, "loss": 0.0016, "num_tokens": 11540473.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 149.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.26766279284264893, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.11970480950549245, "learning_rate": 1.782421634910879e-05, "loss": 0.0048, "num_tokens": 11548543.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 540.125, "completions/mean_terminated_length": 540.125, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.26784726065301606, "frac_reward_zero_std": 1.0, "grad_norm": 0.041259765625, "kl": 0.021397673175670207, "learning_rate": 1.7836508912108178e-05, "loss": 0.0009, "num_tokens": 11562064.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 224.125, "completions/mean_terminated_length": 224.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.26803172846338313, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.05574781307950616, "learning_rate": 1.7848801475107564e-05, "loss": 0.0022, "num_tokens": 11567417.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 722.625, "completions/mean_terminated_length": 533.2857666015625, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.2682161962737502, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.03800541383679956, "learning_rate": 1.7861094038106946e-05, "loss": 0.0015, "num_tokens": 11580030.0, "reward": 1.5, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.26840066408411734, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.07330772932618856, "learning_rate": 1.7873386601106332e-05, "loss": 0.0029, "num_tokens": 11587584.0, "reward": 2.2083334922790527, "reward_std": 0.17251645028591156, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2083333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17251639068126678, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2685851318944844, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.058170861564576626, "learning_rate": 1.7885679164105718e-05, "loss": 0.0023, "num_tokens": 11594260.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 425.625, "completions/mean_terminated_length": 425.625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.2687695997048515, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.030446207616478205, "learning_rate": 1.7897971727105104e-05, "loss": 0.0012, "num_tokens": 11602793.0, "reward": 2.544642925262451, "reward_std": 0.7730307579040527, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.2020305097103119, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 241.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.2689540675152186, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.06003547366708517, "learning_rate": 1.791026429010449e-05, "loss": 0.0024, "num_tokens": 11607511.0, "reward": 2.0500001907348633, "reward_std": 0.5732115507125854, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.42500001192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.31052953004837036, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.2691385353255857, "frac_reward_zero_std": 1.0, "grad_norm": 0.2041015625, "kl": 0.08987831743434072, "learning_rate": 1.7922556853103873e-05, "loss": 0.0036, "num_tokens": 11611536.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.26932300313595275, "frac_reward_zero_std": 1.0, "grad_norm": 0.4453125, "kl": 0.09817486396059394, "learning_rate": 1.793484941610326e-05, "loss": 0.0039, "num_tokens": 11615940.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 282.125, "completions/mean_terminated_length": 282.125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.2695074709463199, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.04399872059002519, "learning_rate": 1.7947141979102645e-05, "loss": 0.0018, "num_tokens": 11622149.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 622.625, "completions/mean_terminated_length": 419.0000305175781, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.26969193875668696, "frac_reward_zero_std": 0.0, "grad_norm": 0.578125, "kl": 0.07350413559470326, "learning_rate": 1.795943454210203e-05, "loss": 0.0029, "num_tokens": 11634154.0, "reward": 2.375, "reward_std": 1.0606601238250732, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.26987640656705403, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.0526186206843704, "learning_rate": 1.7971727105101417e-05, "loss": 0.0021, "num_tokens": 11643700.0, "reward": 2.7291667461395264, "reward_std": 0.4416554570198059, "rewards/fixed_code_pass_all_test_reward/mean": 0.9125000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2474873811006546, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8166666626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22466905415058136, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 198.125, "completions/mean_terminated_length": 198.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.27006087437742116, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.11233165953308344, "learning_rate": 1.79840196681008e-05, "loss": 0.0045, "num_tokens": 11650789.0, "reward": 2.875, "reward_std": 0.2480078637599945, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022762298584, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 315.0, "completions/mean_terminated_length": 315.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.27024534218778823, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.042274029343388975, "learning_rate": 1.7996312231100185e-05, "loss": 0.0017, "num_tokens": 11657829.0, "reward": 2.0885868072509766, "reward_std": 0.26015105843544006, "rewards/fixed_code_pass_all_test_reward/mean": 0.8260869383811951, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.26249998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.26015105843544006, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 395.75, "completions/mean_terminated_length": 395.75, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.2704298099981553, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.03407468739897013, "learning_rate": 1.800860479409957e-05, "loss": 0.0014, "num_tokens": 11665667.0, "reward": 2.466071367263794, "reward_std": 0.6563854813575745, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5910714268684387, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3700868785381317, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.27061427780852243, "frac_reward_zero_std": 1.0, "grad_norm": 0.8515625, "kl": 0.1234976164996624, "learning_rate": 1.8020897357098957e-05, "loss": 0.0049, "num_tokens": 11673316.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 103.25, "completions/mean_terminated_length": 103.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.2707987456188895, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.0925372801721096, "learning_rate": 1.8033189920098343e-05, "loss": 0.0037, "num_tokens": 11676990.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 526.5, "completions/mean_terminated_length": 526.5, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.2709832134292566, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.03790547593962401, "learning_rate": 1.8045482483097726e-05, "loss": 0.0015, "num_tokens": 11683954.0, "reward": 1.4305555820465088, "reward_std": 0.39394670724868774, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4305555820465088, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39394670724868774, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2711676812396237, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.0883001834154129, "learning_rate": 1.8057775046097112e-05, "loss": 0.0035, "num_tokens": 11691137.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.2713521490499908, "frac_reward_zero_std": 0.0, "grad_norm": 0.97265625, "kl": 0.06467348663136363, "learning_rate": 1.8070067609096498e-05, "loss": 0.0026, "num_tokens": 11700229.0, "reward": 2.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 116.125, "completions/mean_terminated_length": 116.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.27153661686035785, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.07072609732858837, "learning_rate": 1.8082360172095884e-05, "loss": 0.0028, "num_tokens": 11710910.0, "reward": 1.917628288269043, "reward_std": 0.8263317346572876, "rewards/fixed_code_pass_all_test_reward/mean": 0.5384615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.21757133305072784, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5041666626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3520630896091461, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 223.375, "completions/mean_terminated_length": 223.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.271721084670725, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.05860043177381158, "learning_rate": 1.809465273509527e-05, "loss": 0.0023, "num_tokens": 11716657.0, "reward": 1.8578431606292725, "reward_std": 0.4089875817298889, "rewards/fixed_code_pass_all_test_reward/mean": 0.8161764740943909, "rewards/fixed_code_pass_all_test_reward/std": 0.3686121702194214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 118.875, "completions/mean_terminated_length": 118.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.27190555248109205, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.06537326332181692, "learning_rate": 1.8106945298094653e-05, "loss": 0.0026, "num_tokens": 11720616.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.2720900202914591, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.07062651007436216, "learning_rate": 1.811923786109404e-05, "loss": 0.0028, "num_tokens": 11728543.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.27227448810182625, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.03528321394696832, "learning_rate": 1.8131530424093425e-05, "loss": 0.0014, "num_tokens": 11734177.0, "reward": 2.551785707473755, "reward_std": 0.6340870261192322, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6767857074737549, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.28930777311325073, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 73.5, "completions/mean_terminated_length": 73.5, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.2724589559121933, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.12216265825554729, "learning_rate": 1.814382298709281e-05, "loss": 0.0049, "num_tokens": 11737573.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 181.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.2726434237225604, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.07946922304108739, "learning_rate": 1.8156115550092193e-05, "loss": 0.0032, "num_tokens": 11747959.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.2728278915329275, "frac_reward_zero_std": 0.0, "grad_norm": 0.5703125, "kl": 0.05615282850340009, "learning_rate": 1.816840811309158e-05, "loss": 0.0022, "num_tokens": 11756163.0, "reward": 2.75, "reward_std": 0.06428251415491104, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.2730123593432946, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.03722208086401224, "learning_rate": 1.8180700676090965e-05, "loss": 0.0015, "num_tokens": 11761966.0, "reward": 2.2593750953674316, "reward_std": 0.24346362054347992, "rewards/fixed_code_pass_all_test_reward/mean": 0.796875, "rewards/fixed_code_pass_all_test_reward/std": 0.1882425844669342, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4624999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14078859984874725, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.27319682715366167, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.06863328511826694, "learning_rate": 1.819299323909035e-05, "loss": 0.0027, "num_tokens": 11771354.0, "reward": 2.0833334922790527, "reward_std": 0.38832157850265503, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2733812949640288, "frac_reward_zero_std": 1.0, "grad_norm": 0.228515625, "kl": 0.05706950859166682, "learning_rate": 1.8205285802089737e-05, "loss": 0.0023, "num_tokens": 11775692.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 231.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.27356576277439587, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.06316191283985972, "learning_rate": 1.8217578365089123e-05, "loss": 0.0025, "num_tokens": 11783242.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 393.5, "completions/mean_terminated_length": 393.5, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.27375023058476294, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.03274623933248222, "learning_rate": 1.822987092808851e-05, "loss": 0.0013, "num_tokens": 11792182.0, "reward": 1.4499999284744263, "reward_std": 0.4359554350376129, "rewards/fixed_code_pass_all_test_reward/mean": 0.45000001788139343, "rewards/fixed_code_pass_all_test_reward/std": 0.4359554350376129, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.27393469839513007, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.08216463541612029, "learning_rate": 1.8242163491087895e-05, "loss": 0.0033, "num_tokens": 11800187.0, "reward": 1.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 111.0, "completions/mean_terminated_length": 111.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.27411916620549714, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.07342926831915975, "learning_rate": 1.825445605408728e-05, "loss": 0.0029, "num_tokens": 11804235.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.2743036340158642, "frac_reward_zero_std": 1.0, "grad_norm": 0.032958984375, "kl": 0.04658154840581119, "learning_rate": 1.8266748617086664e-05, "loss": 0.0019, "num_tokens": 11813287.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.27448810182623135, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.06774209346622229, "learning_rate": 1.827904118008605e-05, "loss": 0.0027, "num_tokens": 11820475.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.2746725696365984, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.07274021347984672, "learning_rate": 1.8291333743085436e-05, "loss": 0.0029, "num_tokens": 11829928.0, "reward": 2.5928571224212646, "reward_std": 0.684820830821991, "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.43153735995292664, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.824999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36154431104660034, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 223.25, "completions/mean_terminated_length": 223.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.2748570374469655, "frac_reward_zero_std": 1.0, "grad_norm": 0.44921875, "kl": 0.0647916910238564, "learning_rate": 1.8303626306084822e-05, "loss": 0.0026, "num_tokens": 11835874.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.2750415052573326, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.0712520512752235, "learning_rate": 1.8315918869084204e-05, "loss": 0.0029, "num_tokens": 11840429.0, "reward": 2.1145834922790527, "reward_std": 0.918221116065979, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4895833432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4765065908432007, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.2752259730676997, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.08973398199304938, "learning_rate": 1.832821143208359e-05, "loss": 0.0036, "num_tokens": 11848695.0, "reward": 1.6826924085617065, "reward_std": 0.19620589911937714, "rewards/fixed_code_pass_all_test_reward/mean": 0.557692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.20864656567573547, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17251639068126678, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 199.375, "completions/mean_terminated_length": 199.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.27541044087806676, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.07710570702329278, "learning_rate": 1.8340503995082976e-05, "loss": 0.0031, "num_tokens": 11854586.0, "reward": 1.7269231081008911, "reward_std": 0.6231244206428528, "rewards/fixed_code_pass_all_test_reward/mean": 0.45192307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.2230863869190216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2750000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45276927947998047, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.2755949086884339, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.046806556871160865, "learning_rate": 1.8352796558082362e-05, "loss": 0.0019, "num_tokens": 11861945.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 128.375, "completions/mean_terminated_length": 128.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.27577937649880097, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.06620435463264585, "learning_rate": 1.836508912108175e-05, "loss": 0.0026, "num_tokens": 11865868.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 316.875, "completions/mean_terminated_length": 316.875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.27596384430916804, "frac_reward_zero_std": 0.0, "grad_norm": 0.66796875, "kl": 0.023914585472084582, "learning_rate": 1.837738168408113e-05, "loss": 0.001, "num_tokens": 11872723.0, "reward": 2.586111068725586, "reward_std": 0.2387319654226303, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5861111283302307, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2387319654226303, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 363.125, "completions/mean_terminated_length": 363.125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.27614831211953517, "frac_reward_zero_std": 0.0, "grad_norm": 0.625, "kl": 0.0389128508977592, "learning_rate": 1.8389674247080517e-05, "loss": 0.0016, "num_tokens": 11880652.0, "reward": 1.5125000476837158, "reward_std": 0.7039429545402527, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4955156147480011, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.27633277992990224, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.1410667598247528, "learning_rate": 1.8401966810079903e-05, "loss": 0.0056, "num_tokens": 11888477.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.2765172477402693, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.10571246221661568, "learning_rate": 1.841425937307929e-05, "loss": 0.0042, "num_tokens": 11892502.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 224.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.27670171555063644, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.08740396681241691, "learning_rate": 1.8426551936078675e-05, "loss": 0.0035, "num_tokens": 11902821.0, "reward": 2.5833332538604736, "reward_std": 0.3128635287284851, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.3128635585308075, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.2768861833610035, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.04893564130179584, "learning_rate": 1.8438844499078058e-05, "loss": 0.002, "num_tokens": 11906980.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.2770706511713706, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.08299006707966328, "learning_rate": 1.8451137062077444e-05, "loss": 0.0033, "num_tokens": 11912214.0, "reward": 2.4791667461395264, "reward_std": 0.3012869358062744, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.24800792336463928, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6041666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.2772551189817377, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.05648953630588949, "learning_rate": 1.846342962507683e-05, "loss": 0.0023, "num_tokens": 11920697.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.2774395867921048, "frac_reward_zero_std": 1.0, "grad_norm": 2.015625, "kl": 0.20807561930269003, "learning_rate": 1.8475722188076216e-05, "loss": 0.0083, "num_tokens": 11928129.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 153.125, "completions/mean_terminated_length": 153.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.27762405460247186, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.10399617534130812, "learning_rate": 1.84880147510756e-05, "loss": 0.0042, "num_tokens": 11938434.0, "reward": 1.769230842590332, "reward_std": 0.25759798288345337, "rewards/fixed_code_pass_all_test_reward/mean": 0.7692307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.25759798288345337, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.27780852241283893, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.06465249368920922, "learning_rate": 1.8500307314074984e-05, "loss": 0.0026, "num_tokens": 11947126.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 132.25, "completions/mean_terminated_length": 132.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.27799299022320606, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.09773002425208688, "learning_rate": 1.851259987707437e-05, "loss": 0.0039, "num_tokens": 11953536.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 116.75, "completions/mean_terminated_length": 116.75, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.27817745803357313, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.12073469813913107, "learning_rate": 1.8524892440073756e-05, "loss": 0.0048, "num_tokens": 11959958.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 196.875, "completions/mean_terminated_length": 196.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.2783619258439402, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.04933562804944813, "learning_rate": 1.8537185003073142e-05, "loss": 0.002, "num_tokens": 11968997.0, "reward": 2.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.27854639365430733, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.09152246592566371, "learning_rate": 1.8549477566072528e-05, "loss": 0.0037, "num_tokens": 11973771.0, "reward": 2.46875, "reward_std": 1.0559144020080566, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.84375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35197150707244873, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 234.5, "completions/mean_terminated_length": 234.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.2787308614646744, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.06116507248952985, "learning_rate": 1.856177012907191e-05, "loss": 0.0024, "num_tokens": 11978975.0, "reward": 2.5875000953674316, "reward_std": 0.4290771186351776, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8374999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25035688281059265, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 251.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.2789153292750415, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.0791391022503376, "learning_rate": 1.8574062692071297e-05, "loss": 0.0032, "num_tokens": 11987704.0, "reward": 2.037356376647949, "reward_std": 0.6095748543739319, "rewards/fixed_code_pass_all_test_reward/mean": 0.28735631704330444, "rewards/fixed_code_pass_all_test_reward/std": 0.09753197431564331, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.2790997970854086, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.10017428477294743, "learning_rate": 1.8586355255070683e-05, "loss": 0.004, "num_tokens": 11992949.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.2792842648957757, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.13523700926452875, "learning_rate": 1.859864781807007e-05, "loss": 0.0054, "num_tokens": 12000840.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.27946873270614275, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.04960720078088343, "learning_rate": 1.8610940381069455e-05, "loss": 0.002, "num_tokens": 12005770.0, "reward": 1.100000023841858, "reward_std": 0.15118582546710968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.10000000149011612, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1511857956647873, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 123.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.2796532005165099, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.0899674529209733, "learning_rate": 1.862323294406884e-05, "loss": 0.0036, "num_tokens": 12012404.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.27983766832687695, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.09692036639899015, "learning_rate": 1.8635525507068227e-05, "loss": 0.0039, "num_tokens": 12016376.0, "reward": 2.3000001907348633, "reward_std": 0.5952190160751343, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6749999523162842, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2815771996974945, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 112.875, "completions/mean_terminated_length": 112.875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.280022136137244, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.07731152698397636, "learning_rate": 1.8647818070067613e-05, "loss": 0.0031, "num_tokens": 12020015.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.28020660394761115, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.26038848608732224, "learning_rate": 1.8660110633066995e-05, "loss": 0.0104, "num_tokens": 12027959.0, "reward": 1.3457446098327637, "reward_std": 0.43704044818878174, "rewards/fixed_code_pass_all_test_reward/mean": 0.34574466943740845, "rewards/fixed_code_pass_all_test_reward/std": 0.43704044818878174, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.2803910717579782, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.11141340807080269, "learning_rate": 1.867240319606638e-05, "loss": 0.0045, "num_tokens": 12035158.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2805755395683453, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.09447638923302293, "learning_rate": 1.8684695759065767e-05, "loss": 0.0038, "num_tokens": 12044282.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 109.375, "completions/mean_terminated_length": 109.375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.28076000737871243, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.11470020096749067, "learning_rate": 1.8696988322065153e-05, "loss": 0.0046, "num_tokens": 12048205.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 243.5, "completions/mean_terminated_length": 243.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2809444751890795, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.05496544414199889, "learning_rate": 1.870928088506454e-05, "loss": 0.0022, "num_tokens": 12054529.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.2811289429994466, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.10254763904958963, "learning_rate": 1.8721573448063922e-05, "loss": 0.0041, "num_tokens": 12064585.0, "reward": 2.29356050491333, "reward_std": 0.45791947841644287, "rewards/fixed_code_pass_all_test_reward/mean": 0.33522728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.42566534876823425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.2813134108098137, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.0437618438154459, "learning_rate": 1.8733866011063308e-05, "loss": 0.0018, "num_tokens": 12071060.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 315.0, "completions/mean_terminated_length": 315.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.2814978786201808, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.048319410998374224, "learning_rate": 1.8746158574062694e-05, "loss": 0.0019, "num_tokens": 12078492.0, "reward": 1.5, "reward_std": 0.34503281116485596, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.28168234643054785, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.06256037461571395, "learning_rate": 1.875845113706208e-05, "loss": 0.0025, "num_tokens": 12083285.0, "reward": 2.140625, "reward_std": 0.5499898791313171, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.265625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3187412619590759, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.281866814240915, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.07209006184712052, "learning_rate": 1.8770743700061466e-05, "loss": 0.0029, "num_tokens": 12089889.0, "reward": 2.625, "reward_std": 1.0606601238250732, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 108.25, "completions/mean_terminated_length": 108.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.28205128205128205, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.1404612180776894, "learning_rate": 1.878303626306085e-05, "loss": 0.0056, "num_tokens": 12094323.0, "reward": 2.9613094329833984, "reward_std": 0.07192295789718628, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.961309552192688, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0719228982925415, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 111.625, "completions/mean_terminated_length": 111.625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.2822357498616491, "frac_reward_zero_std": 0.0, "grad_norm": 3.515625, "kl": 0.2081840094178915, "learning_rate": 1.8795328826060235e-05, "loss": 0.0083, "num_tokens": 12100952.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 297.5, "completions/mean_terminated_length": 297.5, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.28242021767201625, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.06572046969085932, "learning_rate": 1.880762138905962e-05, "loss": 0.0026, "num_tokens": 12109588.0, "reward": 2.4583332538604736, "reward_std": 0.24800804257392883, "rewards/fixed_code_pass_all_test_reward/mean": 0.4583333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.24800795316696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2826046854823833, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.07586927921511233, "learning_rate": 1.8819913952059007e-05, "loss": 0.003, "num_tokens": 12114159.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 233.25, "completions/mean_terminated_length": 233.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.2827891532927504, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.07752114604227245, "learning_rate": 1.8832206515058393e-05, "loss": 0.0031, "num_tokens": 12121833.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 192.25, "completions/mean_terminated_length": 192.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2829736211031175, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.04007907025516033, "learning_rate": 1.8844499078057775e-05, "loss": 0.0016, "num_tokens": 12126891.0, "reward": 2.856250047683716, "reward_std": 0.1801537126302719, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.856249988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18015369772911072, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 142.0, "completions/mean_terminated_length": 142.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2831580889134846, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.07346141198650002, "learning_rate": 1.885679164105716e-05, "loss": 0.0029, "num_tokens": 12134635.0, "reward": 2.8153042793273926, "reward_std": 0.13847309350967407, "rewards/fixed_code_pass_all_test_reward/mean": 0.8569711446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.05779239162802696, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.28334255672385167, "frac_reward_zero_std": 1.0, "grad_norm": 0.267578125, "kl": 0.08676804788410664, "learning_rate": 1.8869084204056547e-05, "loss": 0.0035, "num_tokens": 12139405.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2835270245342188, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.1580574018880725, "learning_rate": 1.8881376767055933e-05, "loss": 0.0063, "num_tokens": 12161301.0, "reward": 1.7078313827514648, "reward_std": 0.2378310114145279, "rewards/fixed_code_pass_all_test_reward/mean": 0.5828313231468201, "rewards/fixed_code_pass_all_test_reward/std": 0.15330620110034943, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17251639068126678, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 93.25, "completions/mean_terminated_length": 93.25, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.28371149234458587, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.07399769336916506, "learning_rate": 1.8893669330055316e-05, "loss": 0.003, "num_tokens": 12164871.0, "reward": 2.674999952316284, "reward_std": 0.46521881222724915, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.675000011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4652188718318939, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.28389596015495294, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.08235473930835724, "learning_rate": 1.8905961893054702e-05, "loss": 0.0033, "num_tokens": 12172737.0, "reward": 2.0416667461395264, "reward_std": 0.11785121262073517, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 134.5, "completions/mean_terminated_length": 134.5, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.28408042796532007, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.09529830608516932, "learning_rate": 1.8918254456054088e-05, "loss": 0.0038, "num_tokens": 12176869.0, "reward": 2.28125, "reward_std": 0.41052013635635376, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.28125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.41052016615867615, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 447.5, "completions/mean_terminated_length": 447.5, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.28426489577568714, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.04335727612487972, "learning_rate": 1.8930547019053474e-05, "loss": 0.0017, "num_tokens": 12190393.0, "reward": 1.5625, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4955156147480011, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 496.625, "completions/mean_terminated_length": 496.625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.2844493635860542, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.03588163969106972, "learning_rate": 1.894283958205286e-05, "loss": 0.0014, "num_tokens": 12205302.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 191.875, "completions/mean_terminated_length": 191.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.28463383139642134, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.09716257732361555, "learning_rate": 1.8955132145052242e-05, "loss": 0.0039, "num_tokens": 12211309.0, "reward": 1.75, "reward_std": 0.3333558440208435, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.3333558440208435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 93.25, "completions/mean_terminated_length": 93.25, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.2848182992067884, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.127746787853539, "learning_rate": 1.896742470805163e-05, "loss": 0.0051, "num_tokens": 12220047.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 265.875, "completions/mean_terminated_length": 265.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.2850027670171555, "frac_reward_zero_std": 0.0, "grad_norm": 0.66015625, "kl": 0.04327067406848073, "learning_rate": 1.8979717271051014e-05, "loss": 0.0017, "num_tokens": 12226974.0, "reward": 1.1477272510528564, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 275.25, "completions/mean_terminated_length": 275.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.2851872348275226, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.04663902637548745, "learning_rate": 1.89920098340504e-05, "loss": 0.0019, "num_tokens": 12234720.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 312.25, "completions/mean_terminated_length": 312.25, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.2853717026378897, "frac_reward_zero_std": 0.0, "grad_norm": 0.703125, "kl": 0.06496414984576404, "learning_rate": 1.9004302397049786e-05, "loss": 0.0026, "num_tokens": 12244258.0, "reward": 2.7750000953674316, "reward_std": 0.3240370750427246, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8999999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2828427255153656, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.28555617044825676, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.04426635312847793, "learning_rate": 1.9016594960049172e-05, "loss": 0.0018, "num_tokens": 12252374.0, "reward": 2.019230842590332, "reward_std": 0.29184532165527344, "rewards/fixed_code_pass_all_test_reward/mean": 0.7692307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.324530690908432, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15430335700511932, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 68.625, "completions/mean_terminated_length": 68.625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.2857406382586239, "frac_reward_zero_std": 1.0, "grad_norm": 0.80078125, "kl": 0.17562317103147507, "learning_rate": 1.902888752304856e-05, "loss": 0.007, "num_tokens": 12255859.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 179.875, "completions/mean_terminated_length": 179.875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.28592510606899096, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.10275443317368627, "learning_rate": 1.9041180086047944e-05, "loss": 0.0041, "num_tokens": 12265218.0, "reward": 1.5416667461395264, "reward_std": 0.7753646969795227, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 550.5, "completions/mean_terminated_length": 550.5, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.28610957387935804, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.021960201615002006, "learning_rate": 1.9053472649047327e-05, "loss": 0.0009, "num_tokens": 12283494.0, "reward": 2.1666665077209473, "reward_std": 0.9799578189849854, "rewards/fixed_code_pass_all_test_reward/mean": 0.7291666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.4537104666233063, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3204349875450134, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.28629404168972516, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.056303240125998855, "learning_rate": 1.9065765212046713e-05, "loss": 0.0023, "num_tokens": 12289022.0, "reward": 1.3684210777282715, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3684210479259491, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.28647850950009224, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.07266494317445904, "learning_rate": 1.90780577750461e-05, "loss": 0.0029, "num_tokens": 12293367.0, "reward": 1.9979166984558105, "reward_std": 0.436475545167923, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.12291666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1743326336145401, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.2866629773104593, "frac_reward_zero_std": 0.0, "grad_norm": 0.73046875, "kl": 0.05023109703324735, "learning_rate": 1.9090350338045485e-05, "loss": 0.002, "num_tokens": 12300990.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.28684744512082644, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.06358591839671135, "learning_rate": 1.910264290104487e-05, "loss": 0.0025, "num_tokens": 12310933.0, "reward": 1.774999976158142, "reward_std": 0.4449005424976349, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.12567278742790222, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 330.375, "completions/mean_terminated_length": 330.375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.2870319129311935, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.035262564197182655, "learning_rate": 1.9114935464044254e-05, "loss": 0.0014, "num_tokens": 12318752.0, "reward": 1.2916667461395264, "reward_std": 0.3753305673599243, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.37533053755760193, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 237.25, "completions/mean_terminated_length": 237.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.2872163807415606, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.05133911105804145, "learning_rate": 1.912722802704364e-05, "loss": 0.0021, "num_tokens": 12327698.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2874008485519277, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.07435206091031432, "learning_rate": 1.9139520590043026e-05, "loss": 0.003, "num_tokens": 12336878.0, "reward": 1.673076868057251, "reward_std": 0.350702702999115, "rewards/fixed_code_pass_all_test_reward/mean": 0.6730769276618958, "rewards/fixed_code_pass_all_test_reward/std": 0.3507027328014374, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 267.125, "completions/mean_terminated_length": 267.125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.2875853163622948, "frac_reward_zero_std": 0.0, "grad_norm": 0.69140625, "kl": 0.04830144334118813, "learning_rate": 1.9151813153042412e-05, "loss": 0.0019, "num_tokens": 12348759.0, "reward": 2.765625, "reward_std": 0.3499840497970581, "rewards/fixed_code_pass_all_test_reward/mean": 0.890625, "rewards/fixed_code_pass_all_test_reward/std": 0.16952534019947052, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.28776978417266186, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.08023297227919102, "learning_rate": 1.9164105716041798e-05, "loss": 0.0032, "num_tokens": 12356677.0, "reward": 2.203125, "reward_std": 0.9491831660270691, "rewards/fixed_code_pass_all_test_reward/mean": 0.453125, "rewards/fixed_code_pass_all_test_reward/std": 0.3768555223941803, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.287954251983029, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.06864577136002481, "learning_rate": 1.917639827904118e-05, "loss": 0.0027, "num_tokens": 12360581.0, "reward": 2.9791665077209473, "reward_std": 0.058925628662109375, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9791666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 74.625, "completions/mean_terminated_length": 74.625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.28813871979339606, "frac_reward_zero_std": 0.0, "grad_norm": 4.625, "kl": 0.1432599751278758, "learning_rate": 1.9188690842040566e-05, "loss": 0.0057, "num_tokens": 12364082.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 256.25, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.28832318760376313, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.03487297846004367, "learning_rate": 1.9200983405039952e-05, "loss": 0.0014, "num_tokens": 12370772.0, "reward": 1.9375, "reward_std": 0.4172614812850952, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 92.625, "completions/mean_terminated_length": 92.625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.28850765541413026, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.10655389761086553, "learning_rate": 1.921327596803934e-05, "loss": 0.0043, "num_tokens": 12375441.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 417.125, "completions/mean_terminated_length": 417.125, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.28869212322449733, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.02254857635125518, "learning_rate": 1.9225568531038724e-05, "loss": 0.0009, "num_tokens": 12388170.0, "reward": 2.201923131942749, "reward_std": 0.4863956570625305, "rewards/fixed_code_pass_all_test_reward/mean": 0.32692307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.2900141477584839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.2888765910348644, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.13988269865512848, "learning_rate": 1.9237861094038107e-05, "loss": 0.0056, "num_tokens": 12394696.0, "reward": 1.960185170173645, "reward_std": 0.3178086578845978, "rewards/fixed_code_pass_all_test_reward/mean": 0.7685185074806213, "rewards/fixed_code_pass_all_test_reward/std": 0.19168488681316376, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.19166666269302368, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22590208053588867, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.28906105884523153, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.05068887793458998, "learning_rate": 1.9250153657037493e-05, "loss": 0.002, "num_tokens": 12406227.0, "reward": 2.237903356552124, "reward_std": 0.25571373105049133, "rewards/fixed_code_pass_all_test_reward/mean": 0.23790322244167328, "rewards/fixed_code_pass_all_test_reward/std": 0.25571367144584656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.2892455266555986, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.0435558226890862, "learning_rate": 1.926244622003688e-05, "loss": 0.0017, "num_tokens": 12417252.0, "reward": 2.549999952316284, "reward_std": 0.4869731664657593, "rewards/fixed_code_pass_all_test_reward/mean": 0.550000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.4869731664657593, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.2894299944659657, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.08344120625406504, "learning_rate": 1.9274738783036265e-05, "loss": 0.0033, "num_tokens": 12422547.0, "reward": 2.625, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 103.0, "completions/mean_terminated_length": 103.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.2896144622763328, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.08431837754324079, "learning_rate": 1.928703134603565e-05, "loss": 0.0034, "num_tokens": 12426227.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.2897989300866999, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.05406241083983332, "learning_rate": 1.9299323909035034e-05, "loss": 0.0022, "num_tokens": 12437559.0, "reward": 2.25, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.4432026445865631, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 134.5, "completions/mean_terminated_length": 134.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.28998339789706695, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.07228834019042552, "learning_rate": 1.931161647203442e-05, "loss": 0.0029, "num_tokens": 12444323.0, "reward": 2.625, "reward_std": 1.0606601238250732, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 180.125, "completions/mean_terminated_length": 180.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.290167865707434, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.0456280407961458, "learning_rate": 1.9323909035033806e-05, "loss": 0.0018, "num_tokens": 12453132.0, "reward": 2.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 89.125, "completions/mean_terminated_length": 89.125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.29035233351780115, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.1347931483760476, "learning_rate": 1.933620159803319e-05, "loss": 0.0054, "num_tokens": 12459589.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 108.375, "completions/mean_terminated_length": 108.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.2905368013281682, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.08701448887586594, "learning_rate": 1.9348494161032578e-05, "loss": 0.0035, "num_tokens": 12463600.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.2907212691385353, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.08067146828398108, "learning_rate": 1.936078672403196e-05, "loss": 0.0032, "num_tokens": 12472508.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2909057369489024, "frac_reward_zero_std": 1.0, "grad_norm": 0.328125, "kl": 0.1365876216441393, "learning_rate": 1.9373079287031346e-05, "loss": 0.0055, "num_tokens": 12479524.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.2910902047592695, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.06615155143663287, "learning_rate": 1.9385371850030732e-05, "loss": 0.0026, "num_tokens": 12486505.0, "reward": 1.4500000476837158, "reward_std": 0.141421377658844, "rewards/fixed_code_pass_all_test_reward/mean": 0.44999998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.2912746725696366, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.06343846768140793, "learning_rate": 1.9397664413030118e-05, "loss": 0.0025, "num_tokens": 12494992.0, "reward": 2.3125, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.45806270837783813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 404.125, "completions/mean_terminated_length": 404.125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.2914591403800037, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.03705636388622224, "learning_rate": 1.9409956976029504e-05, "loss": 0.0015, "num_tokens": 12507417.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.2916436081903708, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.05218057497404516, "learning_rate": 1.942224953902889e-05, "loss": 0.0021, "num_tokens": 12519205.0, "reward": 1.7672269344329834, "reward_std": 0.17068980634212494, "rewards/fixed_code_pass_all_test_reward/mean": 0.06722689419984818, "rewards/fixed_code_pass_all_test_reward/std": 0.11030054837465286, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.699999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0963624119758606, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 97.875, "completions/mean_terminated_length": 97.875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.29182807600073785, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "kl": 0.13225928787142038, "learning_rate": 1.9434542102028276e-05, "loss": 0.0053, "num_tokens": 12522812.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.292012543811105, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.11062068957835436, "learning_rate": 1.9446834665027662e-05, "loss": 0.0044, "num_tokens": 12531177.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 196.875, "completions/mean_terminated_length": 196.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.29219701162147205, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.06714000971987844, "learning_rate": 1.9459127228027045e-05, "loss": 0.0027, "num_tokens": 12540008.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 505.875, "completions/mean_terminated_length": 505.875, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.2923814794318391, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.03459054185077548, "learning_rate": 1.947141979102643e-05, "loss": 0.0014, "num_tokens": 12550359.0, "reward": 1.3214285373687744, "reward_std": 0.29326748847961426, "rewards/fixed_code_pass_all_test_reward/mean": 0.3214285671710968, "rewards/fixed_code_pass_all_test_reward/std": 0.29326751828193665, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 217.75, "completions/mean_terminated_length": 217.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.29256594724220625, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.042278224835172296, "learning_rate": 1.9483712354025817e-05, "loss": 0.0017, "num_tokens": 12558925.0, "reward": 2.8612637519836426, "reward_std": 0.2570406496524811, "rewards/fixed_code_pass_all_test_reward/mean": 0.932692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.19037489593029022, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9285714626312256, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2020305097103119, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.2927504150525733, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.04989356896840036, "learning_rate": 1.9496004917025203e-05, "loss": 0.002, "num_tokens": 12567937.0, "reward": 2.5969388484954834, "reward_std": 0.2074791043996811, "rewards/fixed_code_pass_all_test_reward/mean": 0.5969387888908386, "rewards/fixed_code_pass_all_test_reward/std": 0.20747901499271393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 848.375, "completions/mean_terminated_length": 848.375, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 0.2929348828629404, "frac_reward_zero_std": 0.0, "grad_norm": 0.59375, "kl": 0.022408646997064352, "learning_rate": 1.950829748002459e-05, "loss": 0.0009, "num_tokens": 12583452.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 258.125, "completions/mean_terminated_length": 258.125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.2931193506733075, "frac_reward_zero_std": 1.0, "grad_norm": 0.04736328125, "kl": 0.041000147350132465, "learning_rate": 1.952059004302397e-05, "loss": 0.0016, "num_tokens": 12594197.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 146.25, "completions/mean_terminated_length": 146.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.2933038184836746, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.06691160332411528, "learning_rate": 1.9532882606023357e-05, "loss": 0.0027, "num_tokens": 12603655.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.29348828629404167, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.07052203826606274, "learning_rate": 1.9545175169022743e-05, "loss": 0.0028, "num_tokens": 12612343.0, "reward": 1.969202995300293, "reward_std": 0.47828835248947144, "rewards/fixed_code_pass_all_test_reward/mean": 0.510869562625885, "rewards/fixed_code_pass_all_test_reward/std": 0.4226570129394531, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4583333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511381149292, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 119.875, "completions/mean_terminated_length": 119.875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.2936727541044088, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.11097941920161247, "learning_rate": 1.955746773202213e-05, "loss": 0.0044, "num_tokens": 12619126.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.29385722191477587, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.10383147094398737, "learning_rate": 1.9569760295021515e-05, "loss": 0.0042, "num_tokens": 12627479.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.29404168972514294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.07451951550319791, "learning_rate": 1.9582052858020898e-05, "loss": 0.003, "num_tokens": 12636698.0, "reward": 2.5357141494750977, "reward_std": 0.49634233117103577, "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.49634233117103577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 388.125, "completions/mean_terminated_length": 388.125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.29422615753551007, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.033975063590332866, "learning_rate": 1.9594345421020284e-05, "loss": 0.0014, "num_tokens": 12644579.0, "reward": 1.816249966621399, "reward_std": 0.43273013830184937, "rewards/fixed_code_pass_all_test_reward/mean": 0.9099999666213989, "rewards/fixed_code_pass_all_test_reward/std": 0.2545584738254547, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 78.625, "completions/mean_terminated_length": 78.625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.29441062534587714, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "kl": 0.1232863999903202, "learning_rate": 1.960663798401967e-05, "loss": 0.0049, "num_tokens": 12648192.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.2945950931562442, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.04258407140150666, "learning_rate": 1.9618930547019056e-05, "loss": 0.0017, "num_tokens": 12655551.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 283.875, "completions/mean_terminated_length": 283.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.29477956096661134, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.057602216489613056, "learning_rate": 1.963122311001844e-05, "loss": 0.0023, "num_tokens": 12665870.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.2949640287769784, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.06031497893854976, "learning_rate": 1.9643515673017825e-05, "loss": 0.0024, "num_tokens": 12670913.0, "reward": 1.933333396911621, "reward_std": 0.454955130815506, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5583333373069763, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4766783118247986, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 249.75, "completions/mean_terminated_length": 249.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.2951484965873455, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.07196444645524025, "learning_rate": 1.965580823601721e-05, "loss": 0.0029, "num_tokens": 12682047.0, "reward": 2.2136363983154297, "reward_std": 0.2777461111545563, "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8500000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2777460217475891, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.2953329643977126, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.09434425085783005, "learning_rate": 1.9668100799016597e-05, "loss": 0.0038, "num_tokens": 12686129.0, "reward": 2.9583334922790527, "reward_std": 0.11785107105970383, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 275.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.2955174322080797, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.060736874118447304, "learning_rate": 1.9680393362015983e-05, "loss": 0.0024, "num_tokens": 12692623.0, "reward": 1.5328296422958374, "reward_std": 0.39406880736351013, "rewards/fixed_code_pass_all_test_reward/mean": 0.2953296899795532, "rewards/fixed_code_pass_all_test_reward/std": 0.28844472765922546, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.23749999701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3700868785381317, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 82.75, "completions/mean_terminated_length": 82.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.29570190001844676, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.17092635855078697, "learning_rate": 1.9692685925015365e-05, "loss": 0.0068, "num_tokens": 12696101.0, "reward": 1.0729167461395264, "reward_std": 0.1368400603532791, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.2958863678288139, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.0784892737865448, "learning_rate": 1.970497848801475e-05, "loss": 0.0031, "num_tokens": 12700175.0, "reward": 1.5208333730697632, "reward_std": 0.46664541959762573, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5208333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.46664541959762573, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.29607083563918096, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.0611266428604722, "learning_rate": 1.9717271051014137e-05, "loss": 0.0024, "num_tokens": 12705675.0, "reward": 1.2249999046325684, "reward_std": 0.4949747323989868, "rewards/fixed_code_pass_all_test_reward/mean": 0.3500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 438.25, "completions/mean_terminated_length": 438.25, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.29625530344954804, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884765625, "kl": 0.033239323645830154, "learning_rate": 1.9729563614013523e-05, "loss": 0.0013, "num_tokens": 12714437.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 72.625, "completions/mean_terminated_length": 72.625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.29643977125991516, "frac_reward_zero_std": 1.0, "grad_norm": 0.1767578125, "kl": 0.1369410203769803, "learning_rate": 1.974185617701291e-05, "loss": 0.0055, "num_tokens": 12717746.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 245.625, "completions/mean_terminated_length": 245.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.29662423907028224, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.06340020010247827, "learning_rate": 1.9754148740012292e-05, "loss": 0.0025, "num_tokens": 12730023.0, "reward": 1.8523809909820557, "reward_std": 0.28888794779777527, "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.29407793283462524, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3166666626930237, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0471404567360878, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 520.625, "completions/mean_terminated_length": 520.625, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.2968087068806493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0245361328125, "kl": 0.020887983264401555, "learning_rate": 1.9766441303011678e-05, "loss": 0.0008, "num_tokens": 12743260.0, "reward": 1.9506173133850098, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9506173133850098, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 247.375, "completions/mean_terminated_length": 247.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.29699317469101644, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.04820909630507231, "learning_rate": 1.9778733866011064e-05, "loss": 0.0019, "num_tokens": 12753807.0, "reward": 1.638888955116272, "reward_std": 0.6675479412078857, "rewards/fixed_code_pass_all_test_reward/mean": 0.472222238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.20857571065425873, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511381149292, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.2971776425013835, "frac_reward_zero_std": 0.0, "grad_norm": 9.0625, "kl": 0.27193159563466907, "learning_rate": 1.979102642901045e-05, "loss": 0.0109, "num_tokens": 12760872.0, "reward": 2.4375, "reward_std": 0.47715675830841064, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.4381372928619385, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.2973621103117506, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.05683129350654781, "learning_rate": 1.9803318992009836e-05, "loss": 0.0023, "num_tokens": 12769031.0, "reward": 2.950000047683716, "reward_std": 0.1414213627576828, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.949999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213478565216, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 165.5, "completions/mean_terminated_length": 165.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.2975465781221177, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.07954602688550949, "learning_rate": 1.9815611555009222e-05, "loss": 0.0032, "num_tokens": 12777219.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 315.875, "completions/mean_terminated_length": 315.875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.2977310459324848, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.05172847374342382, "learning_rate": 1.9827904118008608e-05, "loss": 0.0021, "num_tokens": 12789114.0, "reward": 2.3802082538604736, "reward_std": 0.0289318785071373, "rewards/fixed_code_pass_all_test_reward/mean": 0.3802083432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0289318785071373, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 134.75, "completions/mean_terminated_length": 134.75, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.29791551374285186, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.10649146279320121, "learning_rate": 1.9840196681007994e-05, "loss": 0.0043, "num_tokens": 12793840.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 104.25, "completions/mean_terminated_length": 104.25, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.298099981553219, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.09939770679920912, "learning_rate": 1.9852489244007376e-05, "loss": 0.004, "num_tokens": 12797474.0, "reward": 2.852083444595337, "reward_std": 0.13015176355838776, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8520833253860474, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13015177845954895, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.29828444936358606, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "kl": 0.21363095799461007, "learning_rate": 1.9864781807006762e-05, "loss": 0.0085, "num_tokens": 12801851.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 435.25, "completions/mean_terminated_length": 435.25, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.29846891717395313, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.1407157313078642, "learning_rate": 1.987707437000615e-05, "loss": 0.0056, "num_tokens": 12814901.0, "reward": 2.6875, "reward_std": 0.5938674807548523, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.29865338498432026, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.05994216096587479, "learning_rate": 1.9889366933005534e-05, "loss": 0.0024, "num_tokens": 12819413.0, "reward": 1.2000000476837158, "reward_std": 0.33806169033050537, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10350984334945679, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 233.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.29883785279468733, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.047097194474190474, "learning_rate": 1.990165949600492e-05, "loss": 0.0019, "num_tokens": 12825616.0, "reward": 1.920161247253418, "reward_std": 0.3145155608654022, "rewards/fixed_code_pass_all_test_reward/mean": 0.8951612710952759, "rewards/fixed_code_pass_all_test_reward/std": 0.2965286672115326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 360.375, "completions/mean_terminated_length": 360.375, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.2990223206050544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.043374989880248904, "learning_rate": 1.9913952059004303e-05, "loss": 0.0017, "num_tokens": 12836595.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 130.125, "completions/mean_terminated_length": 130.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.29920678841542153, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.0859137149527669, "learning_rate": 1.992624462200369e-05, "loss": 0.0034, "num_tokens": 12840548.0, "reward": 2.9749999046325684, "reward_std": 0.07071065902709961, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9750000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106739282608, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 704.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.2993912562257886, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.163454279943835, "learning_rate": 1.9938537185003075e-05, "loss": 0.0065, "num_tokens": 12853300.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 122.0, "completions/mean_terminated_length": 122.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2995757240361557, "frac_reward_zero_std": 0.0, "grad_norm": 9.0, "kl": 0.8109235377050936, "learning_rate": 1.995082974800246e-05, "loss": 0.0324, "num_tokens": 12860324.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.2997601918465228, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.05822183447889984, "learning_rate": 1.9963122311001847e-05, "loss": 0.0023, "num_tokens": 12865943.0, "reward": 2.578125, "reward_std": 0.5129462480545044, "rewards/fixed_code_pass_all_test_reward/mean": 0.703125, "rewards/fixed_code_pass_all_test_reward/std": 0.2106272578239441, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.2999446596568899, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.06511215027421713, "learning_rate": 1.997541487400123e-05, "loss": 0.0026, "num_tokens": 12871425.0, "reward": 2.25, "reward_std": 1.3887301683425903, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 108.625, "completions/mean_terminated_length": 108.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.30012912746725695, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.06214003823697567, "learning_rate": 1.9987707437000616e-05, "loss": 0.0025, "num_tokens": 12878070.0, "reward": 2.75, "reward_std": 0.15430335700511932, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 224.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.3003135952776241, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.060465916991233826, "learning_rate": 2e-05, "loss": 0.0024, "num_tokens": 12884509.0, "reward": 1.829326868057251, "reward_std": 0.36854293942451477, "rewards/fixed_code_pass_all_test_reward/mean": 0.9230769276618958, "rewards/fixed_code_pass_all_test_reward/std": 0.07121692597866058, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 112.625, "completions/mean_terminated_length": 112.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.30049806308799115, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.13166926056146622, "learning_rate": 1.999999976963052e-05, "loss": 0.0053, "num_tokens": 12893050.0, "reward": 2.5625, "reward_std": 0.6232117414474487, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 84.75, "completions/mean_terminated_length": 84.75, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.3006825308983582, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.07996613951399922, "learning_rate": 1.9999999078522084e-05, "loss": 0.0032, "num_tokens": 12899016.0, "reward": 2.0, "reward_std": 1.0690449476242065, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.30086699870872535, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.07110021216794848, "learning_rate": 1.9999997926674726e-05, "loss": 0.0028, "num_tokens": 12907558.0, "reward": 2.3581349849700928, "reward_std": 0.294719398021698, "rewards/fixed_code_pass_all_test_reward/mean": 0.920634925365448, "rewards/fixed_code_pass_all_test_reward/std": 0.14695556461811066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 184.375, "completions/mean_terminated_length": 184.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.3010514665190924, "frac_reward_zero_std": 1.0, "grad_norm": 0.1875, "kl": 0.06566801527515054, "learning_rate": 1.9999996314088498e-05, "loss": 0.0026, "num_tokens": 12912681.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 115.125, "completions/mean_terminated_length": 115.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3012359343294595, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.20211856625974178, "learning_rate": 1.9999994240763473e-05, "loss": 0.0081, "num_tokens": 12920738.0, "reward": 2.6875, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.3014204021398266, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.05807845131494105, "learning_rate": 1.999999170669975e-05, "loss": 0.0023, "num_tokens": 12932728.0, "reward": 2.0416665077209473, "reward_std": 0.9473060369491577, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.44828587770462036, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.37796446681022644, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3016048699501937, "frac_reward_zero_std": 1.0, "grad_norm": 0.046630859375, "kl": 0.03550087031908333, "learning_rate": 1.9999988711897445e-05, "loss": 0.0014, "num_tokens": 12939224.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 216.75, "completions/mean_terminated_length": 216.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.30178933776056077, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.042324725072830915, "learning_rate": 1.9999985256356697e-05, "loss": 0.0017, "num_tokens": 12946718.0, "reward": 2.6875, "reward_std": 0.5938674807548523, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 222.75, "completions/mean_terminated_length": 222.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3019738055709279, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.05968584166839719, "learning_rate": 1.9999981340077663e-05, "loss": 0.0024, "num_tokens": 12957164.0, "reward": 2.2291665077209473, "reward_std": 0.48121458292007446, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.3922867476940155, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4955156147480011, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 198.875, "completions/mean_terminated_length": 198.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.302158273381295, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.035312173422425985, "learning_rate": 1.9999976963060523e-05, "loss": 0.0014, "num_tokens": 12962971.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 402.125, "completions/mean_terminated_length": 402.125, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.30234274119166205, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.02226614230312407, "learning_rate": 1.999997212530548e-05, "loss": 0.0009, "num_tokens": 12977116.0, "reward": 2.3359375, "reward_std": 0.6667073965072632, "rewards/fixed_code_pass_all_test_reward/mean": 0.9609375, "rewards/fixed_code_pass_all_test_reward/std": 0.11048544198274612, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4364357888698578, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 171.75, "completions/mean_terminated_length": 171.75, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.3025272090020291, "frac_reward_zero_std": 1.0, "grad_norm": 0.054931640625, "kl": 0.03870683046989143, "learning_rate": 1.9999966826812755e-05, "loss": 0.0015, "num_tokens": 12985170.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.30271167681239625, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.025099253514781594, "learning_rate": 1.9999961067582595e-05, "loss": 0.001, "num_tokens": 12993376.0, "reward": 2.861842155456543, "reward_std": 0.1906760185956955, "rewards/fixed_code_pass_all_test_reward/mean": 0.8618420958518982, "rewards/fixed_code_pass_all_test_reward/std": 0.1906760334968567, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3028961446227633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.09411798231303692, "learning_rate": 1.9999954847615268e-05, "loss": 0.0038, "num_tokens": 12997355.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 206.5, "completions/mean_terminated_length": 206.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.3030806124331304, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.1420106729492545, "learning_rate": 1.999994816691105e-05, "loss": 0.0057, "num_tokens": 13005743.0, "reward": 2.75, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 123.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.3032650802434975, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.09659285703673959, "learning_rate": 1.999994102547026e-05, "loss": 0.0039, "num_tokens": 13009705.0, "reward": 2.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 91.0, "completions/mean_terminated_length": 91.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.3034495480538646, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.11103995982557535, "learning_rate": 1.9999933423293225e-05, "loss": 0.0044, "num_tokens": 13013281.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 45.625, "completions/mean_terminated_length": 45.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.30363401586423167, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.06722681783139706, "learning_rate": 1.9999925360380284e-05, "loss": 0.0027, "num_tokens": 13018278.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.3038184836745988, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.05934868939220905, "learning_rate": 1.9999916836731822e-05, "loss": 0.0024, "num_tokens": 13025461.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 162.75, "completions/mean_terminated_length": 162.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.30400295148496587, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.07387213688343763, "learning_rate": 1.9999907852348232e-05, "loss": 0.003, "num_tokens": 13030835.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.30418741929533294, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.02862774848472327, "learning_rate": 1.9999898407229916e-05, "loss": 0.0011, "num_tokens": 13038797.0, "reward": 1.28125, "reward_std": 0.45193037390708923, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 205.75, "completions/mean_terminated_length": 205.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.30437188710570007, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.06493422389030457, "learning_rate": 1.9999888501377316e-05, "loss": 0.0026, "num_tokens": 13044323.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 103.25, "completions/mean_terminated_length": 103.25, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.30455635491606714, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.07256479980424047, "learning_rate": 1.9999878134790892e-05, "loss": 0.0029, "num_tokens": 13048197.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 203.25, "completions/mean_terminated_length": 203.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.3047408227264342, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.04307525767944753, "learning_rate": 1.9999867307471118e-05, "loss": 0.0017, "num_tokens": 13055975.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.30492529053680134, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.03775738971307874, "learning_rate": 1.9999856019418492e-05, "loss": 0.0015, "num_tokens": 13066204.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.3051097583471684, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.06360642006620765, "learning_rate": 1.9999844270633533e-05, "loss": 0.0025, "num_tokens": 13072544.0, "reward": 1.8870967626571655, "reward_std": 0.3193385601043701, "rewards/fixed_code_pass_all_test_reward/mean": 0.8870967626571655, "rewards/fixed_code_pass_all_test_reward/std": 0.31933853030204773, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.3052942261575355, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.0867096777074039, "learning_rate": 1.9999832061116783e-05, "loss": 0.0035, "num_tokens": 13077872.0, "reward": 1.8125, "reward_std": 0.5303300619125366, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 276.0, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.3054786939679026, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.030867422465234995, "learning_rate": 1.999981939086881e-05, "loss": 0.0012, "num_tokens": 13089008.0, "reward": 1.7439320087432861, "reward_std": 0.02512373775243759, "rewards/fixed_code_pass_all_test_reward/mean": 0.24393203854560852, "rewards/fixed_code_pass_all_test_reward/std": 0.025123747065663338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 96.625, "completions/mean_terminated_length": 96.625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.3056631617782697, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.08028953988105059, "learning_rate": 1.9999806259890194e-05, "loss": 0.0032, "num_tokens": 13092669.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.30584762958863676, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.05731497053056955, "learning_rate": 1.9999792668181537e-05, "loss": 0.0023, "num_tokens": 13101083.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.3060320973990039, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.06402529636397958, "learning_rate": 1.999977861574347e-05, "loss": 0.0026, "num_tokens": 13107932.0, "reward": 2.096874952316284, "reward_std": 0.18428796529769897, "rewards/fixed_code_pass_all_test_reward/mean": 0.984375, "rewards/fixed_code_pass_all_test_reward/std": 0.0289318785071373, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11249999701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 62.625, "completions/mean_terminated_length": 62.625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.30621656520937096, "frac_reward_zero_std": 1.0, "grad_norm": 2.875, "kl": 0.3752208538353443, "learning_rate": 1.9999764102576638e-05, "loss": 0.015, "num_tokens": 13111057.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 398.625, "completions/mean_terminated_length": 398.625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.30640103301973803, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.031543316319584846, "learning_rate": 1.9999749128681708e-05, "loss": 0.0013, "num_tokens": 13119030.0, "reward": 1.7968254089355469, "reward_std": 0.33289653062820435, "rewards/fixed_code_pass_all_test_reward/mean": 0.6736111640930176, "rewards/fixed_code_pass_all_test_reward/std": 0.1548382043838501, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.12321428209543228, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18904972076416016, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.30658550083010516, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.1384281236678362, "learning_rate": 1.9999733694059374e-05, "loss": 0.0055, "num_tokens": 13141083.0, "reward": 2.0140562057495117, "reward_std": 0.24308450520038605, "rewards/fixed_code_pass_all_test_reward/mean": 0.6807228922843933, "rewards/fixed_code_pass_all_test_reward/std": 0.24308447539806366, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 383.125, "completions/mean_terminated_length": 383.125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.30676996864047223, "frac_reward_zero_std": 0.0, "grad_norm": 0.68359375, "kl": 0.024397484492510557, "learning_rate": 1.9999717798710345e-05, "loss": 0.001, "num_tokens": 13149724.0, "reward": 1.0680060386657715, "reward_std": 0.11776770651340485, "rewards/fixed_code_pass_all_test_reward/mean": 0.010714286006987095, "rewards/fixed_code_pass_all_test_reward/std": 0.030304577201604843, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0572916679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11980784684419632, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 78.375, "completions/mean_terminated_length": 78.375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.3069544364508393, "frac_reward_zero_std": 0.0, "grad_norm": 5.375, "kl": 0.1682533985003829, "learning_rate": 1.9999701442635353e-05, "loss": 0.0067, "num_tokens": 13153095.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 89.875, "completions/mean_terminated_length": 89.875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.30713890426120644, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.12420688453130424, "learning_rate": 1.9999684625835152e-05, "loss": 0.005, "num_tokens": 13156630.0, "reward": 2.4375, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4955156147480011, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 333.625, "completions/mean_terminated_length": 333.625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.3073233720715735, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.04410346201620996, "learning_rate": 1.9999667348310516e-05, "loss": 0.0018, "num_tokens": 13163883.0, "reward": 1.2433440685272217, "reward_std": 0.6885047554969788, "rewards/fixed_code_pass_all_test_reward/mean": 0.32102271914482117, "rewards/fixed_code_pass_all_test_reward/std": 0.20416179299354553, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17232143878936768, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20654161274433136, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.3075078398819406, "frac_reward_zero_std": 0.0, "grad_norm": 0.5703125, "kl": 0.032941394252702594, "learning_rate": 1.9999649610062244e-05, "loss": 0.0013, "num_tokens": 13172277.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.3076923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.049619803205132484, "learning_rate": 1.999963141109115e-05, "loss": 0.002, "num_tokens": 13179382.0, "reward": 2.633333206176758, "reward_std": 0.09428095817565918, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6333333253860474, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0942809134721756, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 141.125, "completions/mean_terminated_length": 141.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3078767755026748, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.049535027239471674, "learning_rate": 1.9999612751398073e-05, "loss": 0.002, "num_tokens": 13186647.0, "reward": 2.4982995986938477, "reward_std": 0.1395920366048813, "rewards/fixed_code_pass_all_test_reward/mean": 0.8316326141357422, "rewards/fixed_code_pass_all_test_reward/std": 0.1395920068025589, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.30806124331304185, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.10780734568834305, "learning_rate": 1.999959363098387e-05, "loss": 0.0043, "num_tokens": 13190462.0, "reward": 2.125, "reward_std": 0.18898223340511322, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18898223340511322, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.308245711123409, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.048483928898349404, "learning_rate": 1.9999574049849434e-05, "loss": 0.0019, "num_tokens": 13196178.0, "reward": 1.9952380657196045, "reward_std": 0.013468711636960506, "rewards/fixed_code_pass_all_test_reward/mean": 0.9702380895614624, "rewards/fixed_code_pass_all_test_reward/std": 0.08417937159538269, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 333.75, "completions/mean_terminated_length": 333.75, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.30843017893377606, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.029393874807283282, "learning_rate": 1.999955400799565e-05, "loss": 0.0012, "num_tokens": 13208944.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 323.25, "completions/mean_terminated_length": 323.25, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.30861464674414313, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.03807128104381263, "learning_rate": 1.9999533505423453e-05, "loss": 0.0015, "num_tokens": 13221234.0, "reward": 2.1175594329833984, "reward_std": 0.7546272873878479, "rewards/fixed_code_pass_all_test_reward/mean": 0.7008928656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.4151545763015747, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.34503278136253357, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 558.0, "completions/mean_terminated_length": 558.0, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.30879911455451026, "frac_reward_zero_std": 0.0, "grad_norm": 0.546875, "kl": 0.019152153632603586, "learning_rate": 1.999951254213379e-05, "loss": 0.0008, "num_tokens": 13237202.0, "reward": 2.71875, "reward_std": 0.38816189765930176, "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.30898358236487733, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.08496448211371899, "learning_rate": 1.999949111812761e-05, "loss": 0.0034, "num_tokens": 13241203.0, "reward": 1.7916667461395264, "reward_std": 0.7007365226745605, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5416666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.397112637758255, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 203.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.3091680501752444, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.1348776863887906, "learning_rate": 1.999946923340592e-05, "loss": 0.0054, "num_tokens": 13251114.0, "reward": 2.2002551555633545, "reward_std": 0.9956760406494141, "rewards/fixed_code_pass_all_test_reward/mean": 0.5127550959587097, "rewards/fixed_code_pass_all_test_reward/std": 0.5219565033912659, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 564.25, "completions/mean_terminated_length": 564.25, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.30935251798561153, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.037386814365163445, "learning_rate": 1.9999446887969717e-05, "loss": 0.0015, "num_tokens": 13262004.0, "reward": 1.6919642686843872, "reward_std": 0.5090068578720093, "rewards/fixed_code_pass_all_test_reward/mean": 0.6607142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.47648802399635315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 111.125, "completions/mean_terminated_length": 111.125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.3095369857959786, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.1199882049113512, "learning_rate": 1.9999424081820035e-05, "loss": 0.0048, "num_tokens": 13268685.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 75.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.3097214536063457, "frac_reward_zero_std": 0.0, "grad_norm": 3.296875, "kl": 0.1945429928600788, "learning_rate": 1.999940081495792e-05, "loss": 0.0078, "num_tokens": 13272125.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3099059214167128, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.06696378183551133, "learning_rate": 1.999937708738445e-05, "loss": 0.0027, "num_tokens": 13277721.0, "reward": 1.5871212482452393, "reward_std": 0.770187258720398, "rewards/fixed_code_pass_all_test_reward/mean": 0.6704545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.4684560000896454, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.3100903892270799, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.028612706810235977, "learning_rate": 1.9999352899100715e-05, "loss": 0.0011, "num_tokens": 13287679.0, "reward": 2.668750047683716, "reward_std": 0.2298097163438797, "rewards/fixed_code_pass_all_test_reward/mean": 0.918749988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.2298097163438797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 79.0, "completions/mean_terminated_length": 79.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.31027485703744695, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.11444060876965523, "learning_rate": 1.9999328250107826e-05, "loss": 0.0046, "num_tokens": 13291135.0, "reward": 2.9166667461395264, "reward_std": 0.15430328249931335, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15430334210395813, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 96.125, "completions/mean_terminated_length": 96.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.3104593248478141, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.11009531188756227, "learning_rate": 1.999930314040693e-05, "loss": 0.0044, "num_tokens": 13299856.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 91.625, "completions/mean_terminated_length": 91.625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.31064379265818115, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.09486155724152923, "learning_rate": 1.999927756999917e-05, "loss": 0.0038, "num_tokens": 13308549.0, "reward": 1.75, "reward_std": 0.4432026147842407, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.3108282604685482, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.0881508169695735, "learning_rate": 1.9999251538885735e-05, "loss": 0.0035, "num_tokens": 13316604.0, "reward": 2.734011650085449, "reward_std": 0.3798728585243225, "rewards/fixed_code_pass_all_test_reward/mean": 0.7965116500854492, "rewards/fixed_code_pass_all_test_reward/std": 0.37699225544929504, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.31101272827891535, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.06497808778658509, "learning_rate": 1.9999225047067817e-05, "loss": 0.0026, "num_tokens": 13324358.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 111.875, "completions/mean_terminated_length": 111.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.3111971960892824, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "kl": 0.060727793257683516, "learning_rate": 1.9999198094546635e-05, "loss": 0.0024, "num_tokens": 13335045.0, "reward": 1.682692289352417, "reward_std": 0.4001004099845886, "rewards/fixed_code_pass_all_test_reward/mean": 0.4951923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.20397312939167023, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25877460837364197, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1227.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 545.25, "completions/mean_terminated_length": 545.25, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.3113816638996495, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.030279521830379963, "learning_rate": 1.9999170681323444e-05, "loss": 0.0012, "num_tokens": 13349647.0, "reward": 1.2717947959899902, "reward_std": 0.8004506826400757, "rewards/fixed_code_pass_all_test_reward/mean": 0.2884615361690521, "rewards/fixed_code_pass_all_test_reward/std": 0.238868847489357, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.23333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1511857956647873, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 134.5, "completions/mean_terminated_length": 134.5, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.3115661317100166, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.0666207738686353, "learning_rate": 1.999914280739949e-05, "loss": 0.0027, "num_tokens": 13353779.0, "reward": 1.6437500715255737, "reward_std": 0.5753492712974548, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.26874998211860657, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19074572622776031, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 405.625, "completions/mean_terminated_length": 405.625, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.3117505995203837, "frac_reward_zero_std": 1.0, "grad_norm": 0.0306396484375, "kl": 0.02024973661173135, "learning_rate": 1.9999114472776072e-05, "loss": 0.0008, "num_tokens": 13366008.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.31193506733075077, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.09074535267427564, "learning_rate": 1.9999085677454487e-05, "loss": 0.0036, "num_tokens": 13372152.0, "reward": 2.84375, "reward_std": 0.29693374037742615, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.84375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.29693374037742615, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.3121195351411179, "frac_reward_zero_std": 1.0, "grad_norm": 0.0390625, "kl": 0.03962083766236901, "learning_rate": 1.9999056421436065e-05, "loss": 0.0016, "num_tokens": 13381534.0, "reward": 2.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.31230400295148497, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.06266049644909799, "learning_rate": 1.999902670472215e-05, "loss": 0.0025, "num_tokens": 13390496.0, "reward": 2.230769157409668, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.23076923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.625, "completions/mean_terminated_length": 122.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.31248847076185204, "frac_reward_zero_std": 1.0, "grad_norm": 1.109375, "kl": 0.13233327632769942, "learning_rate": 1.9998996527314115e-05, "loss": 0.0053, "num_tokens": 13397061.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 220.75, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.3126729385722192, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.03260780870914459, "learning_rate": 1.9998965889213345e-05, "loss": 0.0013, "num_tokens": 13406747.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 290.25, "completions/mean_terminated_length": 290.25, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.31285740638258625, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.049882425693795085, "learning_rate": 1.9998934790421262e-05, "loss": 0.002, "num_tokens": 13413965.0, "reward": 1.75, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 80.875, "completions/mean_terminated_length": 80.875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3130418741929533, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.23889169236645103, "learning_rate": 1.9998903230939292e-05, "loss": 0.0096, "num_tokens": 13417436.0, "reward": 2.112499952316284, "reward_std": 0.21001699566841125, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11249999701976776, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21001701056957245, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.125, "completions/mean_terminated_length": 123.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.31322634200332045, "frac_reward_zero_std": 1.0, "grad_norm": 0.53125, "kl": 0.15762214548885822, "learning_rate": 1.999887121076889e-05, "loss": 0.0063, "num_tokens": 13424901.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 313.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.3134108098136875, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.05360007705166936, "learning_rate": 1.9998838729911526e-05, "loss": 0.0021, "num_tokens": 13432393.0, "reward": 1.305059552192688, "reward_std": 0.37999120354652405, "rewards/fixed_code_pass_all_test_reward/mean": 0.3571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.43865683674812317, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 101.875, "completions/mean_terminated_length": 101.875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.3135952776240546, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.07497143000364304, "learning_rate": 1.999880578836871e-05, "loss": 0.003, "num_tokens": 13438560.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 104.0, "completions/mean_terminated_length": 104.0, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.3137797454344217, "frac_reward_zero_std": 1.0, "grad_norm": 0.193359375, "kl": 0.15772962756454945, "learning_rate": 1.999877238614194e-05, "loss": 0.0063, "num_tokens": 13442432.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.3139642132447888, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.02820848603732884, "learning_rate": 1.9998738523232776e-05, "loss": 0.0011, "num_tokens": 13449112.0, "reward": 2.001286745071411, "reward_std": 0.5098690986633301, "rewards/fixed_code_pass_all_test_reward/mean": 0.591911792755127, "rewards/fixed_code_pass_all_test_reward/std": 0.49825137853622437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.40937501192092896, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16277830302715302, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 463.0, "completions/mean_terminated_length": 463.0, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.31414868105515587, "frac_reward_zero_std": 1.0, "grad_norm": 0.1630859375, "kl": 0.0411873715929687, "learning_rate": 1.9998704199642762e-05, "loss": 0.0016, "num_tokens": 13458600.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 162.375, "completions/mean_terminated_length": 162.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.314333148865523, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.09529448114335537, "learning_rate": 1.999866941537349e-05, "loss": 0.0038, "num_tokens": 13466371.0, "reward": 1.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 222.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.31451761667589007, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.050843637669458985, "learning_rate": 1.9998634170426558e-05, "loss": 0.002, "num_tokens": 13472632.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 226.375, "completions/mean_terminated_length": 226.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.31470208448625714, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.05248252535238862, "learning_rate": 1.9998598464803587e-05, "loss": 0.0021, "num_tokens": 13478187.0, "reward": 2.2520833015441895, "reward_std": 0.17982304096221924, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25208333134651184, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17982301115989685, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 315.25, "completions/mean_terminated_length": 315.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.3148865522966242, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.08436562027782202, "learning_rate": 1.999856229850623e-05, "loss": 0.0034, "num_tokens": 13487965.0, "reward": 1.375, "reward_std": 0.9346233010292053, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.42082318663597107, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2314550280570984, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 300.625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.31507102010699134, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.05916617694310844, "learning_rate": 1.9998525671536147e-05, "loss": 0.0024, "num_tokens": 13497690.0, "reward": 1.4583333730697632, "reward_std": 0.9074208736419678, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7083333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4520675837993622, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.3152554879173584, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0476741271559149, "learning_rate": 1.9998488583895026e-05, "loss": 0.0019, "num_tokens": 13505304.0, "reward": 1.125, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 174.125, "completions/mean_terminated_length": 174.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.3154399557277255, "frac_reward_zero_std": 1.0, "grad_norm": 0.169921875, "kl": 0.06328506069257855, "learning_rate": 1.9998451035584577e-05, "loss": 0.0025, "num_tokens": 13513193.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.3156244235380926, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.05432618292979896, "learning_rate": 1.9998413026606532e-05, "loss": 0.0022, "num_tokens": 13521200.0, "reward": 2.1159911155700684, "reward_std": 0.31871452927589417, "rewards/fixed_code_pass_all_test_reward/mean": 0.824324369430542, "rewards/fixed_code_pass_all_test_reward/std": 0.3067980110645294, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511381149292, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 230.75, "completions/mean_terminated_length": 230.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3158088913484597, "frac_reward_zero_std": 1.0, "grad_norm": 0.34765625, "kl": 0.09627803973853588, "learning_rate": 1.999837455696264e-05, "loss": 0.0039, "num_tokens": 13530174.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 103.75, "completions/mean_terminated_length": 103.75, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.31599335915882676, "frac_reward_zero_std": 1.0, "grad_norm": 1.515625, "kl": 0.1168278472032398, "learning_rate": 1.9998335626654673e-05, "loss": 0.0047, "num_tokens": 13533876.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 249.75, "completions/mean_terminated_length": 249.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.3161778269691939, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.05155725940130651, "learning_rate": 1.9998296235684428e-05, "loss": 0.0021, "num_tokens": 13542258.0, "reward": 2.3333334922790527, "reward_std": 0.5634361505508423, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 255.75, "completions/mean_terminated_length": 255.75, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.31636229477956096, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.04711177374701947, "learning_rate": 1.9998256384053714e-05, "loss": 0.0019, "num_tokens": 13552544.0, "reward": 2.375, "reward_std": 1.0606601238250732, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.31654676258992803, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.09034869819879532, "learning_rate": 1.999821607176437e-05, "loss": 0.0036, "num_tokens": 13560010.0, "reward": 1.625, "reward_std": 0.2781743109226227, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.2781743109226227, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 187.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.31673123040029516, "frac_reward_zero_std": 1.0, "grad_norm": 0.2099609375, "kl": 0.06367912655696273, "learning_rate": 1.9998175298818258e-05, "loss": 0.0025, "num_tokens": 13565288.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 387.375, "completions/mean_terminated_length": 387.375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.31691569821066223, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.06437455234117806, "learning_rate": 1.9998134065217248e-05, "loss": 0.0026, "num_tokens": 13573491.0, "reward": 1.0416667461395264, "reward_std": 0.5841830968856812, "rewards/fixed_code_pass_all_test_reward/mean": 0.21875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 100.125, "completions/mean_terminated_length": 100.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.3171001660210293, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.10684493090957403, "learning_rate": 1.999809237096325e-05, "loss": 0.0043, "num_tokens": 13577140.0, "reward": 0.96875, "reward_std": 0.4712729752063751, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 640.875, "completions/mean_terminated_length": 640.875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.31728463383139643, "frac_reward_zero_std": 0.0, "grad_norm": 0.37890625, "kl": 0.030473407125100493, "learning_rate": 1.9998050216058173e-05, "loss": 0.0012, "num_tokens": 13593651.0, "reward": 2.0416667461395264, "reward_std": 0.4520675241947174, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022762298584, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 140.625, "completions/mean_terminated_length": 140.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3174691016417635, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.07836072100326419, "learning_rate": 1.999800760050397e-05, "loss": 0.0031, "num_tokens": 13599904.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.3176535694521306, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.041290535824373364, "learning_rate": 1.9997964524302595e-05, "loss": 0.0017, "num_tokens": 13609347.0, "reward": 1.9166667461395264, "reward_std": 0.19839003682136536, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.19839002192020416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.3178380372624977, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.13377131568267941, "learning_rate": 1.999792098745604e-05, "loss": 0.0054, "num_tokens": 13615959.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 695.5, "completions/mean_terminated_length": 695.5, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.3180225050728648, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.034487376222386956, "learning_rate": 1.9997876989966313e-05, "loss": 0.0014, "num_tokens": 13627419.0, "reward": 1.4969828128814697, "reward_std": 0.43573373556137085, "rewards/fixed_code_pass_all_test_reward/mean": 0.40948277711868286, "rewards/fixed_code_pass_all_test_reward/std": 0.27303817868232727, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08749999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 140.875, "completions/mean_terminated_length": 140.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.31820697288323185, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.10363984433934093, "learning_rate": 1.999783253183543e-05, "loss": 0.0041, "num_tokens": 13635410.0, "reward": 2.339912176132202, "reward_std": 0.13820016384124756, "rewards/fixed_code_pass_all_test_reward/mean": 0.8399122953414917, "rewards/fixed_code_pass_all_test_reward/std": 0.13820016384124756, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.318391440693599, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.07598115922883153, "learning_rate": 1.999778761306545e-05, "loss": 0.003, "num_tokens": 13642025.0, "reward": 1.75, "reward_std": 1.1649646759033203, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 349.75, "completions/mean_terminated_length": 349.75, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.31857590850396605, "frac_reward_zero_std": 0.0, "grad_norm": 0.58203125, "kl": 0.04792942595668137, "learning_rate": 1.9997742233658438e-05, "loss": 0.0019, "num_tokens": 13654231.0, "reward": 2.0202653408050537, "reward_std": 0.1584950089454651, "rewards/fixed_code_pass_all_test_reward/mean": 0.7140151262283325, "rewards/fixed_code_pass_all_test_reward/std": 0.14457878470420837, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.30625003576278687, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.05189848691225052, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 308.625, "completions/mean_terminated_length": 308.625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.3187603763143331, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.05011031590402126, "learning_rate": 1.9997696393616484e-05, "loss": 0.002, "num_tokens": 13660996.0, "reward": 2.057886838912964, "reward_std": 0.1925724297761917, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.11454054713249207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.16502977907657623, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2249545156955719, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 235.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.31894484412470026, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.058096221415326, "learning_rate": 1.99976500929417e-05, "loss": 0.0023, "num_tokens": 13669510.0, "reward": 2.7544641494750977, "reward_std": 0.35377877950668335, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9419642686843872, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10788434743881226, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 429.5, "completions/mean_terminated_length": 429.5, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.31912931193506733, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.04178065073210746, "learning_rate": 1.9997603331636223e-05, "loss": 0.0017, "num_tokens": 13685370.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 437.625, "completions/mean_terminated_length": 437.625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.3193137797454344, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.04996235761791468, "learning_rate": 1.9997556109702208e-05, "loss": 0.002, "num_tokens": 13694375.0, "reward": 1.3814903497695923, "reward_std": 0.14428359270095825, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13149037957191467, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.14428363740444183, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 145.125, "completions/mean_terminated_length": 145.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.31949824755580153, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.11665847431868315, "learning_rate": 1.9997508427141824e-05, "loss": 0.0047, "num_tokens": 13702176.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 92.25, "completions/mean_terminated_length": 92.25, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.3196827153661686, "frac_reward_zero_std": 1.0, "grad_norm": 0.32421875, "kl": 0.15271799312904477, "learning_rate": 1.9997460283957272e-05, "loss": 0.0061, "num_tokens": 13705738.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 464.625, "completions/mean_terminated_length": 464.625, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.3198671831765357, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.035404902417212725, "learning_rate": 1.999741168015077e-05, "loss": 0.0014, "num_tokens": 13722751.0, "reward": 2.3125, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 104.75, "completions/mean_terminated_length": 104.75, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.3200516509869028, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.14741816651076078, "learning_rate": 1.999736261572456e-05, "loss": 0.0059, "num_tokens": 13728637.0, "reward": 2.5729165077209473, "reward_std": 0.7202258110046387, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.78125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36443448066711426, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 104.5, "completions/mean_terminated_length": 104.5, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.3202361187972699, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.06674479623325169, "learning_rate": 1.9997313090680896e-05, "loss": 0.0027, "num_tokens": 13732681.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 192.125, "completions/mean_terminated_length": 192.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.32042058660763695, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.044696701457723975, "learning_rate": 1.9997263105022065e-05, "loss": 0.0018, "num_tokens": 13742890.0, "reward": 1.9783653020858765, "reward_std": 0.8250001668930054, "rewards/fixed_code_pass_all_test_reward/mean": 0.22836539149284363, "rewards/fixed_code_pass_all_test_reward/std": 0.2238990068435669, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3206050544180041, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.04323675506748259, "learning_rate": 1.999721265875037e-05, "loss": 0.0017, "num_tokens": 13749424.0, "reward": 2.7857141494750977, "reward_std": 0.3967800736427307, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.3967800438404083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 342.625, "completions/mean_terminated_length": 342.625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.32078952222837115, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.042260923190042377, "learning_rate": 1.999716175186813e-05, "loss": 0.0017, "num_tokens": 13757349.0, "reward": 1.8250000476837158, "reward_std": 0.36154431104660034, "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.36154431104660034, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 213.0, "completions/mean_terminated_length": 213.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.3209739900387382, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.0717672819737345, "learning_rate": 1.99971103843777e-05, "loss": 0.0029, "num_tokens": 13762869.0, "reward": 1.7416666746139526, "reward_std": 0.3886280059814453, "rewards/fixed_code_pass_all_test_reward/mean": 0.6166666746139526, "rewards/fixed_code_pass_all_test_reward/std": 0.2624669373035431, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2314550280570984, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 89.125, "completions/mean_terminated_length": 89.125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.32115845784910535, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.1663325596600771, "learning_rate": 1.9997058556281438e-05, "loss": 0.0067, "num_tokens": 13769614.0, "reward": 2.700000047683716, "reward_std": 0.2267785668373108, "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.22677868604660034, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.3213429256594724, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.07789011904969811, "learning_rate": 1.9997006267581735e-05, "loss": 0.0031, "num_tokens": 13774280.0, "reward": 1.056249976158142, "reward_std": 0.4554726779460907, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.18125000596046448, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17512750625610352, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3215273934698395, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.058512299321591854, "learning_rate": 1.9996953518281002e-05, "loss": 0.0023, "num_tokens": 13779040.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3217118612802066, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.05878856289200485, "learning_rate": 1.9996900308381665e-05, "loss": 0.0024, "num_tokens": 13783660.0, "reward": 2.1076388359069824, "reward_std": 0.7301816940307617, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3576388955116272, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.33839982748031616, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3218963290905737, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.09899926092475653, "learning_rate": 1.999684663788618e-05, "loss": 0.004, "num_tokens": 13788976.0, "reward": 1.78125, "reward_std": 0.7545332908630371, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.40625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.38688188791275024, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 71.375, "completions/mean_terminated_length": 71.375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.32208079690094077, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.11495971539989114, "learning_rate": 1.9996792506797017e-05, "loss": 0.0046, "num_tokens": 13792419.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 74.5, "completions/mean_terminated_length": 74.5, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.3222652647113079, "frac_reward_zero_std": 1.0, "grad_norm": 0.4140625, "kl": 0.22097861766815186, "learning_rate": 1.9996737915116672e-05, "loss": 0.0088, "num_tokens": 13800527.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 82.375, "completions/mean_terminated_length": 82.375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.32244973252167497, "frac_reward_zero_std": 0.0, "grad_norm": 3.21875, "kl": 0.22521552070975304, "learning_rate": 1.999668286284766e-05, "loss": 0.009, "num_tokens": 13804138.0, "reward": 2.0416667461395264, "reward_std": 0.11785121262073517, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 68.625, "completions/mean_terminated_length": 68.625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.32263420033204204, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.11189127387478948, "learning_rate": 1.9996627349992518e-05, "loss": 0.0045, "num_tokens": 13807559.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 347.125, "completions/mean_terminated_length": 347.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.32281866814240917, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.02638039435259998, "learning_rate": 1.9996571376553797e-05, "loss": 0.0011, "num_tokens": 13815048.0, "reward": 1.8177536725997925, "reward_std": 0.2573033571243286, "rewards/fixed_code_pass_all_test_reward/mean": 0.7010869383811951, "rewards/fixed_code_pass_all_test_reward/std": 0.4387628436088562, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.11666667461395264, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19354668259620667, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 263.375, "completions/mean_terminated_length": 263.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.32300313595277624, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0585814097430557, "learning_rate": 1.9996514942534085e-05, "loss": 0.0023, "num_tokens": 13821467.0, "reward": 1.28125, "reward_std": 0.5135804414749146, "rewards/fixed_code_pass_all_test_reward/mean": 0.40625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 262.25, "completions/mean_terminated_length": 262.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3231876037631433, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.04421175585594028, "learning_rate": 1.999645804793598e-05, "loss": 0.0018, "num_tokens": 13830701.0, "reward": 2.4296875, "reward_std": 0.36164847016334534, "rewards/fixed_code_pass_all_test_reward/mean": 0.4296875, "rewards/fixed_code_pass_all_test_reward/std": 0.36164847016334534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 263.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.32337207157351044, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.06674776878207922, "learning_rate": 1.99964006927621e-05, "loss": 0.0027, "num_tokens": 13843163.0, "reward": 1.7280172109603882, "reward_std": 0.7106781601905823, "rewards/fixed_code_pass_all_test_reward/mean": 0.02801724150776863, "rewards/fixed_code_pass_all_test_reward/std": 0.01828724518418312, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.824999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36154431104660034, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 104.125, "completions/mean_terminated_length": 104.125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.3235565393838775, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.09823434706777334, "learning_rate": 1.9996342877015086e-05, "loss": 0.0039, "num_tokens": 13849660.0, "reward": 2.3333334922790527, "reward_std": 0.7766431570053101, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7083333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4520675837993622, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 103.875, "completions/mean_terminated_length": 103.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.3237410071942446, "frac_reward_zero_std": 0.0, "grad_norm": 13.5, "kl": 0.7548284931108356, "learning_rate": 1.9996284600697612e-05, "loss": 0.0302, "num_tokens": 13856099.0, "reward": 2.3499999046325684, "reward_std": 1.0460811853408813, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8500000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3505098223686218, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 224.0, "completions/mean_terminated_length": 224.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.3239254750046117, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.046787052415311337, "learning_rate": 1.999622586381235e-05, "loss": 0.0019, "num_tokens": 13861747.0, "reward": 2.3125, "reward_std": 0.979704737663269, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3241099428149788, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.14024516474455595, "learning_rate": 1.9996166666362016e-05, "loss": 0.0056, "num_tokens": 13872055.0, "reward": 2.2750000953674316, "reward_std": 0.30771294236183167, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.2881408631801605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4625000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1060660108923912, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 331.875, "completions/mean_terminated_length": 331.875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.32429441062534586, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.06233344040811062, "learning_rate": 1.999610700834933e-05, "loss": 0.0025, "num_tokens": 13879846.0, "reward": 1.6963140964508057, "reward_std": 0.2440457046031952, "rewards/fixed_code_pass_all_test_reward/mean": 0.26923078298568726, "rewards/fixed_code_pass_all_test_reward/std": 0.10878566652536392, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4270833432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.313193678855896, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.324478878435713, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.03539640852250159, "learning_rate": 1.999604688977705e-05, "loss": 0.0014, "num_tokens": 13884245.0, "reward": 1.9032738208770752, "reward_std": 0.3688795864582062, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0282738097012043, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.054716601967811584, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.32466334624608006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.07391435978934169, "learning_rate": 1.9995986310647935e-05, "loss": 0.003, "num_tokens": 13890668.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 83.0, "completions/mean_terminated_length": 83.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.32484781405644714, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "kl": 0.12432141043245792, "learning_rate": 1.9995925270964787e-05, "loss": 0.005, "num_tokens": 13896684.0, "reward": 2.575000047683716, "reward_std": 1.0498299598693848, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.824999988079071, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36154431104660034, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 82.75, "completions/mean_terminated_length": 82.75, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.32503228186681427, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.09113128902390599, "learning_rate": 1.999586377073041e-05, "loss": 0.0036, "num_tokens": 13900194.0, "reward": 2.2708334922790527, "reward_std": 0.6103441119194031, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7708333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.250990092754364, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 310.5, "completions/mean_terminated_length": 310.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.32521674967718134, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05973991681821644, "learning_rate": 1.999580180994764e-05, "loss": 0.0024, "num_tokens": 13907118.0, "reward": 1.3822115659713745, "reward_std": 0.3909420967102051, "rewards/fixed_code_pass_all_test_reward/mean": 0.3509615361690521, "rewards/fixed_code_pass_all_test_reward/std": 0.3311379849910736, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.3254012174875484, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.08669393556192517, "learning_rate": 1.9995739388619335e-05, "loss": 0.0035, "num_tokens": 13915841.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.32558568529791554, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.07184387091547251, "learning_rate": 1.9995676506748367e-05, "loss": 0.0029, "num_tokens": 13926644.0, "reward": 1.7225878238677979, "reward_std": 0.34356215596199036, "rewards/fixed_code_pass_all_test_reward/mean": 0.4309210479259491, "rewards/fixed_code_pass_all_test_reward/std": 0.30525586009025574, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511381149292, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.3257701531082826, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.11000458616763353, "learning_rate": 1.9995613164337634e-05, "loss": 0.0044, "num_tokens": 13936224.0, "reward": 1.75, "reward_std": 1.1649646759033203, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 181.125, "completions/mean_terminated_length": 181.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.3259546209186497, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.08266573213040829, "learning_rate": 1.9995549361390055e-05, "loss": 0.0033, "num_tokens": 13943089.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.3261390887290168, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.11899423412978649, "learning_rate": 1.999548509790857e-05, "loss": 0.0048, "num_tokens": 13954476.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 344.875, "completions/mean_terminated_length": 344.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.3263235565393839, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.07267117593437433, "learning_rate": 1.9995420373896138e-05, "loss": 0.0029, "num_tokens": 13961859.0, "reward": 1.1685606241226196, "reward_std": 0.19927677512168884, "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.14772728085517883, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1879817545413971, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 309.875, "completions/mean_terminated_length": 309.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.32650802434975096, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.08866861718706787, "learning_rate": 1.9995355189355745e-05, "loss": 0.0035, "num_tokens": 13972738.0, "reward": 2.5978260040283203, "reward_std": 0.4828968346118927, "rewards/fixed_code_pass_all_test_reward/mean": 0.5978261232376099, "rewards/fixed_code_pass_all_test_reward/std": 0.4828967750072479, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 247.375, "completions/mean_terminated_length": 247.375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.3266924921601181, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.0825606626458466, "learning_rate": 1.999528954429039e-05, "loss": 0.0033, "num_tokens": 13981565.0, "reward": 1.7589285373687744, "reward_std": 0.3670811355113983, "rewards/fixed_code_pass_all_test_reward/mean": 0.2589285671710968, "rewards/fixed_code_pass_all_test_reward/std": 0.3670811057090759, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 130.25, "completions/mean_terminated_length": 130.25, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.32687695997048516, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.08046220755204558, "learning_rate": 1.99952234387031e-05, "loss": 0.0032, "num_tokens": 13985671.0, "reward": 2.75, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2314550280570984, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 135.875, "completions/mean_terminated_length": 135.875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.32706142778085223, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.097203747369349, "learning_rate": 1.999515687259692e-05, "loss": 0.0039, "num_tokens": 13989574.0, "reward": 2.9583334922790527, "reward_std": 0.11785107105970383, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.3272458955912193, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.05791820981539786, "learning_rate": 1.9995089845974916e-05, "loss": 0.0023, "num_tokens": 13994116.0, "reward": 2.2749998569488525, "reward_std": 0.6497251987457275, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4000000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.42761802673339844, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 492.125, "completions/mean_terminated_length": 492.125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.32743036340158643, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.05569779872894287, "learning_rate": 1.999502235884018e-05, "loss": 0.0022, "num_tokens": 14003989.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.3276148312119535, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "kl": 0.17794070579111576, "learning_rate": 1.999495441119582e-05, "loss": 0.0071, "num_tokens": 14011823.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 326.875, "completions/mean_terminated_length": 326.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.3277992990223206, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.08261857368052006, "learning_rate": 1.999488600304496e-05, "loss": 0.0033, "num_tokens": 14019166.0, "reward": 2.37886905670166, "reward_std": 0.43589070439338684, "rewards/fixed_code_pass_all_test_reward/mean": 0.8080357313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.3764851689338684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5708333253860474, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25522321462631226, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 312.125, "completions/mean_terminated_length": 312.125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.3279837668326877, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.07136417971923947, "learning_rate": 1.999481713439076e-05, "loss": 0.0029, "num_tokens": 14029423.0, "reward": 1.524999976158142, "reward_std": 0.5119988918304443, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2750000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45276927947998047, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 345.75, "completions/mean_terminated_length": 345.75, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.3281682346430548, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.052419818472117186, "learning_rate": 1.9994747805236386e-05, "loss": 0.0021, "num_tokens": 14037117.0, "reward": 1.2215909957885742, "reward_std": 0.19654259085655212, "rewards/fixed_code_pass_all_test_reward/mean": 0.15909090638160706, "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 290.25, "completions/mean_terminated_length": 290.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.32835270245342185, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.06916391744744033, "learning_rate": 1.999467801558504e-05, "loss": 0.0028, "num_tokens": 14042791.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 445.625, "completions/mean_terminated_length": 445.625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.328537170263789, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.05075894994661212, "learning_rate": 1.9994607765439933e-05, "loss": 0.002, "num_tokens": 14051708.0, "reward": 1.84375, "reward_std": 0.35197147727012634, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.35197150707244873, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.32872163807415605, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.09388176258653402, "learning_rate": 1.9994537054804307e-05, "loss": 0.0038, "num_tokens": 14058849.0, "reward": 2.28125, "reward_std": 0.45193037390708923, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 89.125, "completions/mean_terminated_length": 89.125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3289061058845231, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.5496026035398245, "learning_rate": 1.9994465883681407e-05, "loss": 0.022, "num_tokens": 14064674.0, "reward": 2.4791667461395264, "reward_std": 1.0367742776870728, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7291666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39778652787208557, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 300.125, "completions/mean_terminated_length": 300.125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.32909057369489025, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.09343323763459921, "learning_rate": 1.9994394252074528e-05, "loss": 0.0037, "num_tokens": 14071475.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.3292750415052573, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.10993748344480991, "learning_rate": 1.999432215998696e-05, "loss": 0.0044, "num_tokens": 14081211.0, "reward": 1.937206506729126, "reward_std": 0.12962059676647186, "rewards/fixed_code_pass_all_test_reward/mean": 0.6038732528686523, "rewards/fixed_code_pass_all_test_reward/std": 0.12962065637111664, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 232.5, "completions/mean_terminated_length": 232.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.3294595093156244, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.12890854012221098, "learning_rate": 1.9994249607422027e-05, "loss": 0.0052, "num_tokens": 14090575.0, "reward": 2.9000000953674316, "reward_std": 0.2828427255153656, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8999999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2828427255153656, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.3296439771259915, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.09636975638568401, "learning_rate": 1.9994176594383074e-05, "loss": 0.0039, "num_tokens": 14096840.0, "reward": 1.0860215425491333, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.08602150529623032, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 345.25, "completions/mean_terminated_length": 345.25, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3298284449363586, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.06944653484970331, "learning_rate": 1.9994103120873462e-05, "loss": 0.0028, "num_tokens": 14104298.0, "reward": 1.9547102451324463, "reward_std": 0.17553380131721497, "rewards/fixed_code_pass_all_test_reward/mean": 0.9130434989929199, "rewards/fixed_code_pass_all_test_reward/std": 0.09296044707298279, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0416666679084301, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511455655098, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 634.125, "completions/mean_terminated_length": 634.125, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.3300129127467257, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.04349310416728258, "learning_rate": 1.9994029186896578e-05, "loss": 0.0017, "num_tokens": 14117259.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 131.875, "completions/mean_terminated_length": 131.875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.3301973805570928, "frac_reward_zero_std": 1.0, "grad_norm": 0.19921875, "kl": 0.18040384072810411, "learning_rate": 1.999395479245583e-05, "loss": 0.0072, "num_tokens": 14124642.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 526.375, "completions/mean_terminated_length": 526.375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.3303818483674599, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.058563423808664083, "learning_rate": 1.9993879937554638e-05, "loss": 0.0023, "num_tokens": 14137109.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.33056631617782695, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.10442663403227925, "learning_rate": 1.9993804622196457e-05, "loss": 0.0042, "num_tokens": 14141147.0, "reward": 2.0250000953674316, "reward_std": 0.07071065902709961, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 305.875, "completions/mean_terminated_length": 305.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.3307507839881941, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.07954065082594752, "learning_rate": 1.999372884638476e-05, "loss": 0.0032, "num_tokens": 14150378.0, "reward": 1.5625, "reward_std": 0.3204349875450134, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 305.875, "completions/mean_terminated_length": 305.875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.33093525179856115, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.10067116562277079, "learning_rate": 1.9993652610123036e-05, "loss": 0.004, "num_tokens": 14159721.0, "reward": 2.5625, "reward_std": 0.6232117414474487, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 475.75, "completions/mean_terminated_length": 475.75, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.3311197196089282, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.07389162434265018, "learning_rate": 1.9993575913414793e-05, "loss": 0.003, "num_tokens": 14168183.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.33130418741929535, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.1241244999691844, "learning_rate": 1.999349875626357e-05, "loss": 0.005, "num_tokens": 14176904.0, "reward": 2.2916667461395264, "reward_std": 0.2920914888381958, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022762298584, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 238.875, "completions/mean_terminated_length": 238.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.3314886552296624, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.1269928142428398, "learning_rate": 1.999342113867292e-05, "loss": 0.0051, "num_tokens": 14182671.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 694.0, "completions/mean_terminated_length": 500.5714416503906, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.3316731230400295, "frac_reward_zero_std": 0.0, "grad_norm": 0.71484375, "kl": 0.08323066617595032, "learning_rate": 1.999334306064642e-05, "loss": 0.0033, "num_tokens": 14193871.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 397.125, "completions/mean_terminated_length": 397.125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.3318575908503966, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.0931785604916513, "learning_rate": 1.9993264522187662e-05, "loss": 0.0037, "num_tokens": 14203944.0, "reward": 2.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 371.75, "completions/mean_terminated_length": 371.75, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.3320420586607637, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.07114658132195473, "learning_rate": 1.999318552330027e-05, "loss": 0.0028, "num_tokens": 14216598.0, "reward": 1.5485820770263672, "reward_std": 0.6257769465446472, "rewards/fixed_code_pass_all_test_reward/mean": 0.45483195781707764, "rewards/fixed_code_pass_all_test_reward/std": 0.1839679628610611, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 789.375, "completions/mean_terminated_length": 609.5714721679688, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.33222652647113077, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.05749957077205181, "learning_rate": 1.9993106063987887e-05, "loss": 0.0023, "num_tokens": 14228697.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 203.375, "completions/mean_terminated_length": 203.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.3324109942814979, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.12065221462398767, "learning_rate": 1.9993026144254167e-05, "loss": 0.0048, "num_tokens": 14236876.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 374.25, "completions/mean_terminated_length": 374.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.33259546209186497, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.059401087230071425, "learning_rate": 1.9992945764102797e-05, "loss": 0.0024, "num_tokens": 14247726.0, "reward": 2.0304055213928223, "reward_std": 0.525353729724884, "rewards/fixed_code_pass_all_test_reward/mean": 0.7179054021835327, "rewards/fixed_code_pass_all_test_reward/std": 0.340065062046051, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.33277992990223204, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.06631910637952387, "learning_rate": 1.9992864923537472e-05, "loss": 0.0027, "num_tokens": 14252646.0, "reward": 2.125, "reward_std": 0.9910312294960022, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.33296439771259917, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.05251416750252247, "learning_rate": 1.999278362256193e-05, "loss": 0.0021, "num_tokens": 14258510.0, "reward": 2.5208334922790527, "reward_std": 0.3706761300563812, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.33407655358314514, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8333333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022613286972, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.33314886552296624, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.12362776836380363, "learning_rate": 1.9992701861179907e-05, "loss": 0.0049, "num_tokens": 14262599.0, "reward": 2.518749952316284, "reward_std": 0.825459897518158, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.768750011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3769592046737671, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 425.25, "completions/mean_terminated_length": 193.42857360839844, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.06584466644562781, "learning_rate": 1.9992619639395174e-05, "loss": 0.0026, "num_tokens": 14269185.0, "reward": 1.649999976158142, "reward_std": 0.9304376840591431, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4000000059604645, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5014265775680542, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 448.0, "completions/mean_terminated_length": 448.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.33351780114370044, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.1319788594264537, "learning_rate": 1.999253695721152e-05, "loss": 0.0053, "num_tokens": 14279785.0, "reward": 2.8272058963775635, "reward_std": 0.3358749747276306, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8272058963775635, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3358750641345978, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.3337022689540675, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.039784320746548474, "learning_rate": 1.999245381463275e-05, "loss": 0.0016, "num_tokens": 14285293.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.3338867367644346, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.08788242749869823, "learning_rate": 1.9992370211662698e-05, "loss": 0.0035, "num_tokens": 14294023.0, "reward": 2.1500000953674316, "reward_std": 1.3554123640060425, "rewards/fixed_code_pass_all_test_reward/mean": 0.6499999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.4869731664657593, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 459.25, "completions/mean_terminated_length": 459.25, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.3340712045748017, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.061259130015969276, "learning_rate": 1.9992286148305217e-05, "loss": 0.0025, "num_tokens": 14302161.0, "reward": 1.7756803035736084, "reward_std": 0.22479496896266937, "rewards/fixed_code_pass_all_test_reward/mean": 0.6173469424247742, "rewards/fixed_code_pass_all_test_reward/std": 0.18917889893054962, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.15833333134651184, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24992063641548157, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.3342556723851688, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.11502136010676622, "learning_rate": 1.9992201624564176e-05, "loss": 0.0046, "num_tokens": 14310696.0, "reward": 1.6041667461395264, "reward_std": 0.7011187076568604, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7291666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39778652787208557, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.33444014019553586, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.04093519598245621, "learning_rate": 1.999211664044347e-05, "loss": 0.0016, "num_tokens": 14325975.0, "reward": 2.4895834922790527, "reward_std": 0.11301694065332413, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5208333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 246.75, "completions/mean_terminated_length": 246.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.334624608005903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.07050333521328866, "learning_rate": 1.999203119594702e-05, "loss": 0.0028, "num_tokens": 14335229.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 243.5, "completions/mean_terminated_length": 243.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.33480907581627006, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.043219553772360086, "learning_rate": 1.999194529107876e-05, "loss": 0.0017, "num_tokens": 14340825.0, "reward": 1.9791667461395264, "reward_std": 0.41963234543800354, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1041666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1460457742214203, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.33499354362663714, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.07232912117615342, "learning_rate": 1.9991858925842647e-05, "loss": 0.0029, "num_tokens": 14349424.0, "reward": 1.6869643926620483, "reward_std": 0.18485957384109497, "rewards/fixed_code_pass_all_test_reward/mean": 0.11999999731779099, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5669642686843872, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18485954403877258, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 99.375, "completions/mean_terminated_length": 99.375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.33517801143700426, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.12879951344802976, "learning_rate": 1.9991772100242657e-05, "loss": 0.0052, "num_tokens": 14353083.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 555.75, "completions/mean_terminated_length": 555.75, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.33536247924737134, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.03882534697186202, "learning_rate": 1.9991684814282798e-05, "loss": 0.0016, "num_tokens": 14363881.0, "reward": 1.3345239162445068, "reward_std": 0.23967842757701874, "rewards/fixed_code_pass_all_test_reward/mean": 0.2678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.16532503068447113, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06666667014360428, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12848322093486786, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.3355469470577384, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.0692939218133688, "learning_rate": 1.9991597067967084e-05, "loss": 0.0028, "num_tokens": 14372526.0, "reward": 2.3541667461395264, "reward_std": 0.726141631603241, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8541666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.27368009090423584, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.33573141486810554, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.07051007775589824, "learning_rate": 1.999150886129956e-05, "loss": 0.0028, "num_tokens": 14377241.0, "reward": 2.8333334922790527, "reward_std": 0.35634830594062805, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 329.25, "completions/mean_terminated_length": 329.25, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.3359158826784726, "frac_reward_zero_std": 0.0, "grad_norm": 0.6796875, "kl": 0.034073733258992434, "learning_rate": 1.9991420194284294e-05, "loss": 0.0014, "num_tokens": 14385003.0, "reward": 2.7916667461395264, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2314550280570984, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 106.0, "completions/mean_terminated_length": 106.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.3361003504888397, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.09820293495431542, "learning_rate": 1.999133106692537e-05, "loss": 0.0039, "num_tokens": 14395595.0, "reward": 2.230769157409668, "reward_std": 0.49851855635643005, "rewards/fixed_code_pass_all_test_reward/mean": 0.8557692170143127, "rewards/fixed_code_pass_all_test_reward/std": 0.2670634984970093, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2314550280570984, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.3362848182992068, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.09046329138800502, "learning_rate": 1.999124147922689e-05, "loss": 0.0036, "num_tokens": 14399446.0, "reward": 2.5625, "reward_std": 0.4206712245941162, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4206712543964386, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 114.125, "completions/mean_terminated_length": 114.125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.3364692861095739, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.11479423381388187, "learning_rate": 1.9991151431192986e-05, "loss": 0.0046, "num_tokens": 14403351.0, "reward": 2.53125, "reward_std": 0.5077524185180664, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.53125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5077524185180664, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.33665375391994096, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.06326928921043873, "learning_rate": 1.99910609228278e-05, "loss": 0.0025, "num_tokens": 14412572.0, "reward": 1.75, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 227.625, "completions/mean_terminated_length": 227.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.3368382217303081, "frac_reward_zero_std": 0.0, "grad_norm": 0.7578125, "kl": 0.07123136520385742, "learning_rate": 1.9990969954135514e-05, "loss": 0.0028, "num_tokens": 14420545.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 210.875, "completions/mean_terminated_length": 210.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.33702268954067516, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.05515572102740407, "learning_rate": 1.999087852512031e-05, "loss": 0.0022, "num_tokens": 14425888.0, "reward": 2.581249952316284, "reward_std": 0.6927881240844727, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.706250011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.39136892557144165, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 70.625, "completions/mean_terminated_length": 70.625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.33720715735104223, "frac_reward_zero_std": 1.0, "grad_norm": 0.515625, "kl": 0.20465335249900818, "learning_rate": 1.9990786635786407e-05, "loss": 0.0082, "num_tokens": 14429181.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 303.0, "completions/mean_terminated_length": 303.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.33739162516140936, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.11962447827681899, "learning_rate": 1.9990694286138028e-05, "loss": 0.0048, "num_tokens": 14436645.0, "reward": 1.9000000953674316, "reward_std": 0.4690415561199188, "rewards/fixed_code_pass_all_test_reward/mean": 0.21250000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 84.25, "completions/mean_terminated_length": 84.25, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.33757609297177643, "frac_reward_zero_std": 0.0, "grad_norm": 4.1875, "kl": 0.13384659495204687, "learning_rate": 1.999060147617944e-05, "loss": 0.0054, "num_tokens": 14440239.0, "reward": 2.5833334922790527, "reward_std": 0.5841829776763916, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 327.75, "completions/mean_terminated_length": 327.75, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.3377605607821435, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.0396632154006511, "learning_rate": 1.9990508205914912e-05, "loss": 0.0016, "num_tokens": 14447797.0, "reward": 1.9107143878936768, "reward_std": 0.15152285993099213, "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.15152287483215332, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 220.0, "completions/mean_terminated_length": 220.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.33794502859251063, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.06338231917470694, "learning_rate": 1.9990414475348743e-05, "loss": 0.0025, "num_tokens": 14456717.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 165.5, "completions/mean_terminated_length": 165.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.3381294964028777, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.07275934983044863, "learning_rate": 1.999032028448525e-05, "loss": 0.0029, "num_tokens": 14465041.0, "reward": 2.8026316165924072, "reward_std": 0.38238826394081116, "rewards/fixed_code_pass_all_test_reward/mean": 0.8026316165924072, "rewards/fixed_code_pass_all_test_reward/std": 0.38238832354545593, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.3383139642132448, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.04164484702050686, "learning_rate": 1.999022563332878e-05, "loss": 0.0017, "num_tokens": 14470750.0, "reward": 2.2916667461395264, "reward_std": 0.3181045353412628, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.31810450553894043, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.3384984320236119, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.06345146708190441, "learning_rate": 1.9990130521883683e-05, "loss": 0.0025, "num_tokens": 14480720.0, "reward": 2.795454502105713, "reward_std": 0.39101481437683105, "rewards/fixed_code_pass_all_test_reward/mean": 0.7954545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.39101478457450867, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 171.375, "completions/mean_terminated_length": 171.375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.338682899833979, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.07435441692359746, "learning_rate": 1.999003495015435e-05, "loss": 0.003, "num_tokens": 14489883.0, "reward": 2.1706438064575195, "reward_std": 0.11905807256698608, "rewards/fixed_code_pass_all_test_reward/mean": 0.8477272987365723, "rewards/fixed_code_pass_all_test_reward/std": 0.12329117208719254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3229166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.02946278639137745, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 197.875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.33886736764434605, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.06695546675473452, "learning_rate": 1.998993891814518e-05, "loss": 0.0027, "num_tokens": 14494906.0, "reward": 2.84375, "reward_std": 0.35197147727012634, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 447.5, "completions/mean_terminated_length": 447.5, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.3390518354547132, "frac_reward_zero_std": 0.0, "grad_norm": 0.62890625, "kl": 0.022143507259897888, "learning_rate": 1.9989842425860598e-05, "loss": 0.0009, "num_tokens": 14507870.0, "reward": 2.78125, "reward_std": 0.41052013635635376, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 116.625, "completions/mean_terminated_length": 116.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.33923630326508025, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.1544039361178875, "learning_rate": 1.9989745473305052e-05, "loss": 0.0062, "num_tokens": 14514571.0, "reward": 2.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 476.75, "completions/mean_terminated_length": 476.75, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.3394207710754473, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.02607944398187101, "learning_rate": 1.9989648060483005e-05, "loss": 0.001, "num_tokens": 14523297.0, "reward": 1.9500000476837158, "reward_std": 0.18516403436660767, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.012500000186264515, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0353553406894207, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 190.5, "completions/mean_terminated_length": 190.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.3396052388858144, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.056319600669667125, "learning_rate": 1.9989550187398947e-05, "loss": 0.0023, "num_tokens": 14528645.0, "reward": 1.9389097690582275, "reward_std": 0.05050760135054588, "rewards/fixed_code_pass_all_test_reward/mean": 0.9210526347160339, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.01785714365541935, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.05050762742757797, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 662.25, "completions/mean_terminated_length": 662.25, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.3397897066961815, "frac_reward_zero_std": 0.0, "grad_norm": 0.56640625, "kl": 0.035137681872583926, "learning_rate": 1.998945185405739e-05, "loss": 0.0014, "num_tokens": 14541095.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 94.75, "completions/mean_terminated_length": 94.75, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.3399741745065486, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.10221932642161846, "learning_rate": 1.9989353060462862e-05, "loss": 0.0041, "num_tokens": 14544813.0, "reward": 2.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 109.875, "completions/mean_terminated_length": 109.875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.34015864231691567, "frac_reward_zero_std": 0.0, "grad_norm": 3.5625, "kl": 0.08797835931181908, "learning_rate": 1.9989253806619914e-05, "loss": 0.0035, "num_tokens": 14548532.0, "reward": 1.9249999523162842, "reward_std": 0.3845219314098358, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 309.25, "completions/mean_terminated_length": 309.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.3403431101272828, "frac_reward_zero_std": 1.0, "grad_norm": 0.2353515625, "kl": 0.0748425661586225, "learning_rate": 1.9989154092533124e-05, "loss": 0.003, "num_tokens": 14556110.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.3405275779376499, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.13242147373966873, "learning_rate": 1.998905391820708e-05, "loss": 0.0053, "num_tokens": 14563113.0, "reward": 2.625, "reward_std": 1.0606601238250732, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.34071204574801695, "frac_reward_zero_std": 0.0, "grad_norm": 5.71875, "kl": 0.06708682049065828, "learning_rate": 1.9988953283646397e-05, "loss": 0.0027, "num_tokens": 14567910.0, "reward": 1.8093750476837158, "reward_std": 0.34742358326911926, "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, "rewards/fixed_code_pass_all_test_reward/std": 0.31649234890937805, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10350984334945679, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.3408965135583841, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.06378243118524551, "learning_rate": 1.998885218885572e-05, "loss": 0.0026, "num_tokens": 14574725.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.34108098136875115, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.05323595111258328, "learning_rate": 1.99887506338397e-05, "loss": 0.0021, "num_tokens": 14583312.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 81.625, "completions/mean_terminated_length": 81.625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.3412654491791182, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.15114852041006088, "learning_rate": 1.9988648618603013e-05, "loss": 0.006, "num_tokens": 14586677.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 566.625, "completions/mean_terminated_length": 566.625, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.34144991698948535, "frac_reward_zero_std": 0.0, "grad_norm": 0.51953125, "kl": 0.027974896132946014, "learning_rate": 1.998854614315037e-05, "loss": 0.0011, "num_tokens": 14603058.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3416343847998524, "frac_reward_zero_std": 1.0, "grad_norm": 0.185546875, "kl": 0.144367097876966, "learning_rate": 1.9988443207486485e-05, "loss": 0.0058, "num_tokens": 14609982.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.3418188526102195, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.15631168568506837, "learning_rate": 1.9988339811616097e-05, "loss": 0.0063, "num_tokens": 14615348.0, "reward": 2.5625, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3420033204205866, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.14924551453441381, "learning_rate": 1.998823595554398e-05, "loss": 0.006, "num_tokens": 14625525.0, "reward": 2.8839285373687744, "reward_std": 0.3282996118068695, "rewards/fixed_code_pass_all_test_reward/mean": 0.8839285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.32829955220222473, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.3421877882309537, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.11445667198859155, "learning_rate": 1.998813163927491e-05, "loss": 0.0046, "num_tokens": 14632330.0, "reward": 2.9583334922790527, "reward_std": 0.11785107105970383, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.34237225604132077, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.10998689197003841, "learning_rate": 1.9988026862813696e-05, "loss": 0.0044, "num_tokens": 14639002.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 217.625, "completions/mean_terminated_length": 217.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.3425567238516879, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.09724067687056959, "learning_rate": 1.998792162616517e-05, "loss": 0.0039, "num_tokens": 14644863.0, "reward": 1.5858417749404907, "reward_std": 0.5958560705184937, "rewards/fixed_code_pass_all_test_reward/mean": 0.35459184646606445, "rewards/fixed_code_pass_all_test_reward/std": 0.4130808115005493, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.23125000298023224, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3769592046737671, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.34274119166205497, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.1015463792718947, "learning_rate": 1.9987815929334175e-05, "loss": 0.0041, "num_tokens": 14651439.0, "reward": 1.899999976158142, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8999999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2828427255153656, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 235.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.34292565947242204, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.058484770357608795, "learning_rate": 1.9987709772325583e-05, "loss": 0.0023, "num_tokens": 14660369.0, "reward": 2.0999999046325684, "reward_std": 0.37032803893089294, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9750000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106739282608, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 73.125, "completions/mean_terminated_length": 73.125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.34311012728278917, "frac_reward_zero_std": 0.0, "grad_norm": 5.8125, "kl": 0.21171819232404232, "learning_rate": 1.9987603155144282e-05, "loss": 0.0085, "num_tokens": 14663866.0, "reward": 2.28125, "reward_std": 0.6999680995941162, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.40625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4988826811313629, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 320.75, "completions/mean_terminated_length": 320.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.34329459509315624, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.11874036025255919, "learning_rate": 1.9987496077795196e-05, "loss": 0.0047, "num_tokens": 14670488.0, "reward": 1.421875, "reward_std": 0.5745242834091187, "rewards/fixed_code_pass_all_test_reward/mean": 0.546875, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 141.875, "completions/mean_terminated_length": 141.875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.3434790629035233, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.13019677717238665, "learning_rate": 1.998738854028324e-05, "loss": 0.0052, "num_tokens": 14674399.0, "reward": 2.075000047683716, "reward_std": 0.5750776529312134, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.32499998807907104, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4652188718318939, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 127.125, "completions/mean_terminated_length": 127.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.34366353071389044, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.1014368524774909, "learning_rate": 1.9987280542613385e-05, "loss": 0.0041, "num_tokens": 14678200.0, "reward": 1.625, "reward_std": 0.6813851594924927, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18898223340511322, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 241.625, "completions/mean_terminated_length": 241.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3438479985242575, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.06385029968805611, "learning_rate": 1.9987172084790597e-05, "loss": 0.0026, "num_tokens": 14684093.0, "reward": 2.6666667461395264, "reward_std": 0.35634830594062805, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35634830594062805, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 91.25, "completions/mean_terminated_length": 91.25, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.3440324663346246, "frac_reward_zero_std": 0.0, "grad_norm": 3.640625, "kl": 0.16557954251766205, "learning_rate": 1.998706316681988e-05, "loss": 0.0066, "num_tokens": 14687735.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 314.0, "completions/mean_terminated_length": 314.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.3442169341449917, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0807813173159957, "learning_rate": 1.9986953788706243e-05, "loss": 0.0032, "num_tokens": 14694687.0, "reward": 1.491279125213623, "reward_std": 0.6096696853637695, "rewards/fixed_code_pass_all_test_reward/mean": 0.36627906560897827, "rewards/fixed_code_pass_all_test_reward/std": 0.256191223859787, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.3444014019553588, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.1280690310522914, "learning_rate": 1.9986843950454736e-05, "loss": 0.0051, "num_tokens": 14702231.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.34458586976572586, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.08178892126306891, "learning_rate": 1.9986733652070415e-05, "loss": 0.0033, "num_tokens": 14711592.0, "reward": 2.2431435585021973, "reward_std": 0.16785375773906708, "rewards/fixed_code_pass_all_test_reward/mean": 0.9098101258277893, "rewards/fixed_code_pass_all_test_reward/std": 0.1678536832332611, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 386.0, "completions/mean_terminated_length": 386.0, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.344770337576093, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.06164172571152449, "learning_rate": 1.998662289355836e-05, "loss": 0.0025, "num_tokens": 14720504.0, "reward": 1.9600000381469727, "reward_std": 0.1614222526550293, "rewards/fixed_code_pass_all_test_reward/mean": 0.934999942779541, "rewards/fixed_code_pass_all_test_reward/std": 0.1316922903060913, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.34495480538646006, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.16080760769546032, "learning_rate": 1.9986511674923675e-05, "loss": 0.0064, "num_tokens": 14729152.0, "reward": 2.0166666507720947, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.3499999940395355, "rewards/fixed_code_pass_all_test_reward/std": 0.2828426957130432, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 394.25, "completions/mean_terminated_length": 394.25, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.34513927319682713, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.052138290368020535, "learning_rate": 1.998639999617148e-05, "loss": 0.0021, "num_tokens": 14737834.0, "reward": 1.7638888359069824, "reward_std": 0.4382001459598541, "rewards/fixed_code_pass_all_test_reward/mean": 0.7638888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.4382002055644989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 203.375, "completions/mean_terminated_length": 203.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.34532374100719426, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.08237110171467066, "learning_rate": 1.9986287857306935e-05, "loss": 0.0033, "num_tokens": 14746517.0, "reward": 2.40625, "reward_std": 0.4868050217628479, "rewards/fixed_code_pass_all_test_reward/mean": 0.7604166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.44418084621429443, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6458333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 240.25, "completions/mean_terminated_length": 240.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.34550820881756134, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.10290414374321699, "learning_rate": 1.9986175258335193e-05, "loss": 0.0041, "num_tokens": 14756679.0, "reward": 2.4708333015441895, "reward_std": 0.6481812000274658, "rewards/fixed_code_pass_all_test_reward/mean": 0.7208333015441895, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 209.625, "completions/mean_terminated_length": 209.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.3456926766279284, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.15073998365551233, "learning_rate": 1.9986062199261446e-05, "loss": 0.006, "num_tokens": 14764940.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.34587714443829554, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.1279759258031845, "learning_rate": 1.9985948680090904e-05, "loss": 0.0051, "num_tokens": 14772689.0, "reward": 2.0, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 68.25, "completions/mean_terminated_length": 68.25, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.3460616122486626, "frac_reward_zero_std": 0.0, "grad_norm": 4.125, "kl": 0.39817653223872185, "learning_rate": 1.9985834700828793e-05, "loss": 0.0159, "num_tokens": 14779611.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 393.75, "completions/mean_terminated_length": 393.75, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.3462460800590297, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.0844306075014174, "learning_rate": 1.998572026148037e-05, "loss": 0.0034, "num_tokens": 14793049.0, "reward": 2.223360538482666, "reward_std": 0.19942279160022736, "rewards/fixed_code_pass_all_test_reward/mean": 0.22336065769195557, "rewards/fixed_code_pass_all_test_reward/std": 0.19942280650138855, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.3464305478693968, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.0769329397007823, "learning_rate": 1.9985605362050905e-05, "loss": 0.0031, "num_tokens": 14799900.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 83.0, "completions/mean_terminated_length": 83.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3466150156797639, "frac_reward_zero_std": 0.0, "grad_norm": 3.5, "kl": 0.22452333383262157, "learning_rate": 1.9985490002545696e-05, "loss": 0.009, "num_tokens": 14803428.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.34679948349013096, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.10260514169931412, "learning_rate": 1.998537418297005e-05, "loss": 0.0041, "num_tokens": 14812641.0, "reward": 2.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 227.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.3469839513004981, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.12592312414199114, "learning_rate": 1.9985257903329307e-05, "loss": 0.005, "num_tokens": 14822290.0, "reward": 2.4375, "reward_std": 0.7288690805435181, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.21576867997646332, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 285.75, "completions/mean_terminated_length": 285.75, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.34716841911086516, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.08007983257994056, "learning_rate": 1.9985141163628827e-05, "loss": 0.0032, "num_tokens": 14831720.0, "reward": 2.1517858505249023, "reward_std": 0.3424264192581177, "rewards/fixed_code_pass_all_test_reward/mean": 0.6517857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.2140730321407318, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.26726123690605164, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 475.125, "completions/mean_terminated_length": 475.125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.34735288692123223, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.0664110560901463, "learning_rate": 1.998502396387399e-05, "loss": 0.0027, "num_tokens": 14840825.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.34753735473159936, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.08059221040457487, "learning_rate": 1.998490630407019e-05, "loss": 0.0032, "num_tokens": 14853041.0, "reward": 2.030172348022461, "reward_std": 0.012191422283649445, "rewards/fixed_code_pass_all_test_reward/mean": 0.03017241321504116, "rewards/fixed_code_pass_all_test_reward/std": 0.012191496789455414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 509.25, "completions/mean_terminated_length": 509.25, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.34772182254196643, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.04528188332915306, "learning_rate": 1.998478818422285e-05, "loss": 0.0018, "num_tokens": 14863971.0, "reward": 1.125, "reward_std": 0.07715167850255966, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.07715167850255966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 78.875, "completions/mean_terminated_length": 78.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3479062903523335, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.18220077361911535, "learning_rate": 1.9984669604337412e-05, "loss": 0.0073, "num_tokens": 14867346.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 196.125, "completions/mean_terminated_length": 196.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.34809075816270063, "frac_reward_zero_std": 1.0, "grad_norm": 0.328125, "kl": 0.1517836218699813, "learning_rate": 1.9984550564419342e-05, "loss": 0.0061, "num_tokens": 14874579.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3482752259730677, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.1713462695479393, "learning_rate": 1.9984431064474125e-05, "loss": 0.0069, "num_tokens": 14880299.0, "reward": 1.6458333730697632, "reward_std": 0.5523219108581543, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.41547447443008423, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 315.5, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.3484596937834348, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.12334079947322607, "learning_rate": 1.998431110450726e-05, "loss": 0.0049, "num_tokens": 14892095.0, "reward": 2.6785712242126465, "reward_std": 0.20863677561283112, "rewards/fixed_code_pass_all_test_reward/mean": 0.7098214626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.14397767186164856, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.3486441615938019, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.09370084200054407, "learning_rate": 1.998419068452428e-05, "loss": 0.0037, "num_tokens": 14903839.0, "reward": 2.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 110.125, "completions/mean_terminated_length": 110.125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.348828629404169, "frac_reward_zero_std": 1.0, "grad_norm": 0.326171875, "kl": 0.1761025651358068, "learning_rate": 1.9984069804530737e-05, "loss": 0.007, "num_tokens": 14907824.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.34901309721453605, "frac_reward_zero_std": 0.0, "grad_norm": 3.5, "kl": 0.32534983195364475, "learning_rate": 1.998394846453219e-05, "loss": 0.013, "num_tokens": 14914457.0, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 78.25, "completions/mean_terminated_length": 78.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3491975650249032, "frac_reward_zero_std": 1.0, "grad_norm": 0.31640625, "kl": 0.2159548532217741, "learning_rate": 1.9983826664534235e-05, "loss": 0.0086, "num_tokens": 14917803.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 240.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.34938203283527025, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.12778811110183597, "learning_rate": 1.998370440454249e-05, "loss": 0.0051, "num_tokens": 14927628.0, "reward": 1.9646738767623901, "reward_std": 0.13908712565898895, "rewards/fixed_code_pass_all_test_reward/mean": 0.6521739363670349, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13908715546131134, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 292.75, "completions/mean_terminated_length": 292.75, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.3495665006456373, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.08655348140746355, "learning_rate": 1.9983581684562576e-05, "loss": 0.0035, "num_tokens": 14934306.0, "reward": 1.4970238208770752, "reward_std": 0.6883544325828552, "rewards/fixed_code_pass_all_test_reward/mean": 0.5595238208770752, "rewards/fixed_code_pass_all_test_reward/std": 0.2773725390434265, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.34975096845600445, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.07929408922791481, "learning_rate": 1.9983458504600154e-05, "loss": 0.0032, "num_tokens": 14941158.0, "reward": 1.475000023841858, "reward_std": 0.10350984334945679, "rewards/fixed_code_pass_all_test_reward/mean": 0.4749999940395355, "rewards/fixed_code_pass_all_test_reward/std": 0.10350985080003738, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3499354362663715, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.22036558017134666, "learning_rate": 1.99833348646609e-05, "loss": 0.0088, "num_tokens": 14948286.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3501199040767386, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.09227775642648339, "learning_rate": 1.998321076475051e-05, "loss": 0.0037, "num_tokens": 14953541.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 491.875, "completions/mean_terminated_length": 491.875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.3503043718871057, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.0425558106508106, "learning_rate": 1.99830862048747e-05, "loss": 0.0017, "num_tokens": 14963332.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.3504888396974728, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.11677976977080107, "learning_rate": 1.9982961185039208e-05, "loss": 0.0047, "num_tokens": 14971461.0, "reward": 2.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.35067330750783987, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.09896660782396793, "learning_rate": 1.9982835705249797e-05, "loss": 0.004, "num_tokens": 14978644.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 104.0, "completions/mean_terminated_length": 104.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.350857775318207, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.17950161080807447, "learning_rate": 1.9982709765512246e-05, "loss": 0.0072, "num_tokens": 14982300.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.35104224312857407, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.1236040792427957, "learning_rate": 1.998258336583236e-05, "loss": 0.0049, "num_tokens": 14989252.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.35122671093894114, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.15165470959618688, "learning_rate": 1.9982456506215963e-05, "loss": 0.0061, "num_tokens": 14998509.0, "reward": 2.198106050491333, "reward_std": 0.1767766773700714, "rewards/fixed_code_pass_all_test_reward/mean": 0.9272727370262146, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2708333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3514111787493083, "frac_reward_zero_std": 1.0, "grad_norm": 0.474609375, "kl": 0.15038663428276777, "learning_rate": 1.9982329186668893e-05, "loss": 0.006, "num_tokens": 15006771.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 86.875, "completions/mean_terminated_length": 86.875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.35159564655967535, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.09504899801686406, "learning_rate": 1.9982201407197025e-05, "loss": 0.0038, "num_tokens": 15010402.0, "reward": 2.0250000953674316, "reward_std": 0.07071065902709961, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 235.875, "completions/mean_terminated_length": 235.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.3517801143700424, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.11514808121137321, "learning_rate": 1.9982073167806245e-05, "loss": 0.0046, "num_tokens": 15016297.0, "reward": 1.3583333492279053, "reward_std": 0.07071070373058319, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 147.125, "completions/mean_terminated_length": 147.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.3519645821804095, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.11622571898624301, "learning_rate": 1.9981944468502457e-05, "loss": 0.0046, "num_tokens": 15023394.0, "reward": 2.841911792755127, "reward_std": 0.44714102149009705, "rewards/fixed_code_pass_all_test_reward/mean": 0.904411792755127, "rewards/fixed_code_pass_all_test_reward/std": 0.270364373922348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 516.25, "completions/mean_terminated_length": 516.25, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.3521490499907766, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.05949487304314971, "learning_rate": 1.9981815309291594e-05, "loss": 0.0024, "num_tokens": 15038140.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 86.625, "completions/mean_terminated_length": 86.625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3523335178011437, "frac_reward_zero_std": 0.0, "grad_norm": 3.375, "kl": 0.19986925413832068, "learning_rate": 1.9981685690179607e-05, "loss": 0.008, "num_tokens": 15043313.0, "reward": 1.1875, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 268.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.35251798561151076, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.07188594690524042, "learning_rate": 1.998155561117246e-05, "loss": 0.0029, "num_tokens": 15049835.0, "reward": 1.9237070083618164, "reward_std": 0.38862910866737366, "rewards/fixed_code_pass_all_test_reward/mean": 0.8362069129943848, "rewards/fixed_code_pass_all_test_reward/std": 0.3041248619556427, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08749999850988388, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 355.125, "completions/mean_terminated_length": 355.125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.3527024534218779, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.07597842044197023, "learning_rate": 1.9981425072276163e-05, "loss": 0.003, "num_tokens": 15061604.0, "reward": 2.232142925262451, "reward_std": 0.3657134771347046, "rewards/fixed_code_pass_all_test_reward/mean": 0.2321428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.3657134771347046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.35288692123224497, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.09784332569688559, "learning_rate": 1.9981294073496715e-05, "loss": 0.0039, "num_tokens": 15066775.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 372.875, "completions/mean_terminated_length": 372.875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.35307138904261204, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.050527811516076326, "learning_rate": 1.9981162614840156e-05, "loss": 0.002, "num_tokens": 15077270.0, "reward": 1.9211537837982178, "reward_std": 0.005439279600977898, "rewards/fixed_code_pass_all_test_reward/mean": 0.9211539030075073, "rewards/fixed_code_pass_all_test_reward/std": 0.005439282860606909, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.35325585685297917, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.05347632826305926, "learning_rate": 1.9981030696312548e-05, "loss": 0.0021, "num_tokens": 15082975.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.35344032466334624, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.08123844861984253, "learning_rate": 1.9980898317919964e-05, "loss": 0.0032, "num_tokens": 15090756.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1220.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.3536247924737133, "frac_reward_zero_std": 0.0, "grad_norm": 0.6640625, "kl": 0.10789238382130861, "learning_rate": 1.99807654796685e-05, "loss": 0.0043, "num_tokens": 15099404.0, "reward": 1.88671875, "reward_std": 0.3204077482223511, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.88671875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.32040777802467346, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.35380926028408044, "frac_reward_zero_std": 1.0, "grad_norm": 0.1923828125, "kl": 0.21368352603167295, "learning_rate": 1.9980632181564288e-05, "loss": 0.0085, "num_tokens": 15104272.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.3539937280944475, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.09536077780649066, "learning_rate": 1.9980498423613456e-05, "loss": 0.0038, "num_tokens": 15113057.0, "reward": 2.168604850769043, "reward_std": 0.20454168319702148, "rewards/fixed_code_pass_all_test_reward/mean": 0.6686046719551086, "rewards/fixed_code_pass_all_test_reward/std": 0.2045416384935379, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 403.0, "completions/mean_terminated_length": 403.0, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.3541781959048146, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.08096634945832193, "learning_rate": 1.9980364205822177e-05, "loss": 0.0032, "num_tokens": 15126377.0, "reward": 2.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 318.875, "completions/mean_terminated_length": 318.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.3543626637151817, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.06141480104997754, "learning_rate": 1.9980229528196625e-05, "loss": 0.0025, "num_tokens": 15133296.0, "reward": 1.2986111640930176, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.4236111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 233.5, "completions/mean_terminated_length": 233.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.3545471315255488, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.07755677937529981, "learning_rate": 1.9980094390743016e-05, "loss": 0.0031, "num_tokens": 15143556.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.35473159933591586, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.11406587343662977, "learning_rate": 1.9979958793467573e-05, "loss": 0.0046, "num_tokens": 15151697.0, "reward": 2.1979167461395264, "reward_std": 0.23543907701969147, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6979166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23543906211853027, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 224.75, "completions/mean_terminated_length": 224.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.354916067146283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.07705437438562512, "learning_rate": 1.997982273637654e-05, "loss": 0.0031, "num_tokens": 15160887.0, "reward": 1.5283019542694092, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5283018946647644, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 61.5, "completions/mean_terminated_length": 61.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.35510053495665006, "frac_reward_zero_std": 0.0, "grad_norm": 3.46875, "kl": 0.21782657504081726, "learning_rate": 1.9979686219476184e-05, "loss": 0.0087, "num_tokens": 15164083.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.35528500276701713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0498046875, "kl": 0.062443120405077934, "learning_rate": 1.9979549242772804e-05, "loss": 0.0025, "num_tokens": 15173660.0, "reward": 2.3333334922790527, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3333333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 297.0, "completions/mean_terminated_length": 297.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.35546947057738426, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.08941149362362921, "learning_rate": 1.99794118062727e-05, "loss": 0.0036, "num_tokens": 15181476.0, "reward": 2.575000047683716, "reward_std": 0.3845219910144806, "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.24928469955921173, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.26726123690605164, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 98.25, "completions/mean_terminated_length": 98.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.35565393838775133, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.11172611778602004, "learning_rate": 1.9979273909982213e-05, "loss": 0.0045, "num_tokens": 15185038.0, "reward": 2.34375, "reward_std": 0.5499594211578369, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 259.75, "completions/mean_terminated_length": 259.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.3558384061981184, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.06567403580993414, "learning_rate": 1.997913555390769e-05, "loss": 0.0026, "num_tokens": 15196196.0, "reward": 2.640625, "reward_std": 0.07365693897008896, "rewards/fixed_code_pass_all_test_reward/mean": 0.9739583730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.07365695387125015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.35602287400848553, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.15467501245439053, "learning_rate": 1.997899673805551e-05, "loss": 0.0062, "num_tokens": 15206020.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 303.5, "completions/mean_terminated_length": 303.5, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.3562073418188526, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.08599948277696967, "learning_rate": 1.997885746243207e-05, "loss": 0.0034, "num_tokens": 15212992.0, "reward": 1.5267858505249023, "reward_std": 0.5303301215171814, "rewards/fixed_code_pass_all_test_reward/mean": 0.7142857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 277.125, "completions/mean_terminated_length": 277.125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.3563918096292197, "frac_reward_zero_std": 0.0, "grad_norm": 0.75, "kl": 0.06023971736431122, "learning_rate": 1.997871772704378e-05, "loss": 0.0024, "num_tokens": 15219625.0, "reward": 2.9166667461395264, "reward_std": 0.15430328249931335, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15430334210395813, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.3565762774395868, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.13451551459729671, "learning_rate": 1.9978577531897086e-05, "loss": 0.0054, "num_tokens": 15225045.0, "reward": 1.4541666507720947, "reward_std": 0.2700308859348297, "rewards/fixed_code_pass_all_test_reward/mean": 0.3500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.15430335700511932, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1041666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1460457742214203, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 363.75, "completions/mean_terminated_length": 363.75, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.3567607452499539, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.04186659469269216, "learning_rate": 1.9978436876998442e-05, "loss": 0.0017, "num_tokens": 15233507.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.35694521306032095, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.16415533609688282, "learning_rate": 1.9978295762354333e-05, "loss": 0.0066, "num_tokens": 15240420.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 440.75, "completions/mean_terminated_length": 440.75, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.3571296808706881, "frac_reward_zero_std": 0.0, "grad_norm": 0.65234375, "kl": 0.07117517455480993, "learning_rate": 1.9978154187971253e-05, "loss": 0.0028, "num_tokens": 15252938.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.35731414868105515, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.13475200906395912, "learning_rate": 1.9978012153855736e-05, "loss": 0.0054, "num_tokens": 15257147.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.3574986164914222, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.10383280087262392, "learning_rate": 1.9977869660014316e-05, "loss": 0.0042, "num_tokens": 15264774.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 140.625, "completions/mean_terminated_length": 140.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.35768308430178936, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.1605069856159389, "learning_rate": 1.9977726706453564e-05, "loss": 0.0064, "num_tokens": 15272795.0, "reward": 1.75, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.37796446681022644, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.35786755211215643, "frac_reward_zero_std": 1.0, "grad_norm": 0.169921875, "kl": 0.1380249415524304, "learning_rate": 1.9977583293180065e-05, "loss": 0.0055, "num_tokens": 15277697.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 188.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.3580520199225235, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.0976374801248312, "learning_rate": 1.9977439420200424e-05, "loss": 0.0039, "num_tokens": 15282536.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 316.25, "completions/mean_terminated_length": 316.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.35823648773289063, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.07998363231308758, "learning_rate": 1.9977295087521274e-05, "loss": 0.0032, "num_tokens": 15293250.0, "reward": 2.5, "reward_std": 0.4225771427154541, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.4225771427154541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 244.0, "completions/mean_terminated_length": 244.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.3584209555432577, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.07960667787119746, "learning_rate": 1.9977150295149263e-05, "loss": 0.0032, "num_tokens": 15299466.0, "reward": 1.9346591234207153, "reward_std": 0.37362140417099, "rewards/fixed_code_pass_all_test_reward/mean": 0.8409091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.2945791482925415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13684004545211792, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.3586054233536248, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.12725011259317398, "learning_rate": 1.9977005043091062e-05, "loss": 0.0051, "num_tokens": 15305346.0, "reward": 1.77734375, "reward_std": 0.505118191242218, "rewards/fixed_code_pass_all_test_reward/mean": 0.71484375, "rewards/fixed_code_pass_all_test_reward/std": 0.44334033131599426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3587898911639919, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.11097543966025114, "learning_rate": 1.9976859331353363e-05, "loss": 0.0044, "num_tokens": 15309952.0, "reward": 2.0833334922790527, "reward_std": 0.5194624662399292, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7083333730697632, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3646045923233032, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.358974358974359, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.04173485143110156, "learning_rate": 1.9976713159942884e-05, "loss": 0.0017, "num_tokens": 15316121.0, "reward": 2.1845240592956543, "reward_std": 0.16581423580646515, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1845238208770752, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16581416130065918, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 237.125, "completions/mean_terminated_length": 237.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.35915882678472605, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.09389797504991293, "learning_rate": 1.997656652886635e-05, "loss": 0.0038, "num_tokens": 15321610.0, "reward": 2.862499952316284, "reward_std": 0.25599947571754456, "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 254.875, "completions/mean_terminated_length": 254.875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.3593432945950932, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.09978897264227271, "learning_rate": 1.9976419438130526e-05, "loss": 0.004, "num_tokens": 15327609.0, "reward": 1.9568965435028076, "reward_std": 0.12191498279571533, "rewards/fixed_code_pass_all_test_reward/mean": 0.9568965435028076, "rewards/fixed_code_pass_all_test_reward/std": 0.12191496044397354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 247.875, "completions/mean_terminated_length": 247.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.35952776240546025, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.0715919523499906, "learning_rate": 1.9976271887742185e-05, "loss": 0.0029, "num_tokens": 15332512.0, "reward": 2.4872024059295654, "reward_std": 0.263190895318985, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.48720237612724304, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2631908357143402, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 185.75, "completions/mean_terminated_length": 185.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.3597122302158273, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.11100334487855434, "learning_rate": 1.9976123877708123e-05, "loss": 0.0044, "num_tokens": 15337174.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.35989669802619445, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.21310610696673393, "learning_rate": 1.9975975408035166e-05, "loss": 0.0085, "num_tokens": 15344382.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 181.0, "completions/mean_terminated_length": 181.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.3600811658365615, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.11336820106953382, "learning_rate": 1.997582647873015e-05, "loss": 0.0045, "num_tokens": 15350974.0, "reward": 2.1666667461395264, "reward_std": 0.3563483655452728, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1666666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.35634833574295044, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.3602656336469286, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.08156200684607029, "learning_rate": 1.997567708979994e-05, "loss": 0.0033, "num_tokens": 15356325.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 308.75, "completions/mean_terminated_length": 308.75, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.3604501014572957, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.06885905331000686, "learning_rate": 1.9975527241251412e-05, "loss": 0.0028, "num_tokens": 15362923.0, "reward": 1.774999976158142, "reward_std": 0.7206147909164429, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 254.5, "completions/mean_terminated_length": 254.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3606345692676628, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.108709083404392, "learning_rate": 1.9975376933091477e-05, "loss": 0.0043, "num_tokens": 15369047.0, "reward": 1.7295918464660645, "reward_std": 0.10101527720689774, "rewards/fixed_code_pass_all_test_reward/mean": 0.6938775777816772, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0357142873108387, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10101525485515594, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 232.25, "completions/mean_terminated_length": 232.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.36081903707802987, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.14605716848745942, "learning_rate": 1.9975226165327058e-05, "loss": 0.0058, "num_tokens": 15377913.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 246.375, "completions/mean_terminated_length": 246.375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.361003504888397, "frac_reward_zero_std": 1.0, "grad_norm": 0.173828125, "kl": 0.11706452444195747, "learning_rate": 1.9975074937965102e-05, "loss": 0.0047, "num_tokens": 15387988.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 377.375, "completions/mean_terminated_length": 377.375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.36118797269876407, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.06217347295023501, "learning_rate": 1.9974923251012573e-05, "loss": 0.0025, "num_tokens": 15395487.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 467.0, "completions/mean_terminated_length": 467.0, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.36137244050913114, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.07284733513370156, "learning_rate": 1.9974771104476467e-05, "loss": 0.0029, "num_tokens": 15407871.0, "reward": 2.8333332538604736, "reward_std": 0.18184833228588104, "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.18689274787902832, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 582.5, "completions/mean_terminated_length": 582.5, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.36155690831949827, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.04987058020196855, "learning_rate": 1.9974618498363787e-05, "loss": 0.002, "num_tokens": 15421603.0, "reward": 1.9567902088165283, "reward_std": 0.017459416761994362, "rewards/fixed_code_pass_all_test_reward/mean": 0.9567901492118835, "rewards/fixed_code_pass_all_test_reward/std": 0.017459416761994362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 280.5, "completions/mean_terminated_length": 280.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.36174137612986534, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.07829526206478477, "learning_rate": 1.9974465432681567e-05, "loss": 0.0031, "num_tokens": 15428583.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.3619258439402324, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.08368597831577063, "learning_rate": 1.9974311907436862e-05, "loss": 0.0033, "num_tokens": 15437431.0, "reward": 1.9642856121063232, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.2142857164144516, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.36211031175059955, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.09019808145239949, "learning_rate": 1.9974157922636738e-05, "loss": 0.0036, "num_tokens": 15447202.0, "reward": 2.0289976596832275, "reward_std": 0.2172408252954483, "rewards/fixed_code_pass_all_test_reward/mean": 0.7685810327529907, "rewards/fixed_code_pass_all_test_reward/std": 0.21663057804107666, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2604166865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.02946278639137745, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 363.5, "completions/mean_terminated_length": 363.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.3622947795609666, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.04287515196483582, "learning_rate": 1.9974003478288298e-05, "loss": 0.0017, "num_tokens": 15454366.0, "reward": 2.090277910232544, "reward_std": 0.8616549968719482, "rewards/fixed_code_pass_all_test_reward/mean": 0.8611111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.3501070737838745, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3541666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2260337918996811, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.3624792473713337, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.10006074793636799, "learning_rate": 1.9973848574398652e-05, "loss": 0.004, "num_tokens": 15458898.0, "reward": 1.1116070747375488, "reward_std": 0.18254506587982178, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1116071417927742, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18254508078098297, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 382.5, "completions/mean_terminated_length": 382.5, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.3626637151817008, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.10393276019021869, "learning_rate": 1.9973693210974944e-05, "loss": 0.0042, "num_tokens": 15471142.0, "reward": 1.875, "reward_std": 0.7598558664321899, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.27779194712638855, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13908715546131134, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 350.0, "completions/mean_terminated_length": 350.0, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.3628481829920679, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.07284492207691073, "learning_rate": 1.997353738802432e-05, "loss": 0.0029, "num_tokens": 15479166.0, "reward": 1.65625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 298.125, "completions/mean_terminated_length": 298.125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.36303265080243496, "frac_reward_zero_std": 0.0, "grad_norm": 6.625, "kl": 0.079692960716784, "learning_rate": 1.9973381105553975e-05, "loss": 0.0032, "num_tokens": 15490639.0, "reward": 1.5911016464233398, "reward_std": 0.7516381144523621, "rewards/fixed_code_pass_all_test_reward/mean": 0.21610169112682343, "rewards/fixed_code_pass_all_test_reward/std": 0.23446084558963776, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.26726123690605164, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3632171186128021, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.07703633978962898, "learning_rate": 1.9973224363571098e-05, "loss": 0.0031, "num_tokens": 15495039.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 210.5, "completions/mean_terminated_length": 210.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.36340158642316917, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.10803650598973036, "learning_rate": 1.9973067162082916e-05, "loss": 0.0043, "num_tokens": 15502987.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 240.25, "completions/mean_terminated_length": 240.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.36358605423353624, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.09606982348486781, "learning_rate": 1.9972909501096666e-05, "loss": 0.0038, "num_tokens": 15513501.0, "reward": 2.764204502105713, "reward_std": 0.36966145038604736, "rewards/fixed_code_pass_all_test_reward/mean": 0.7954545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.37874457240104675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 206.5, "completions/mean_terminated_length": 206.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.36377052204390337, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.08005368756130338, "learning_rate": 1.997275138061962e-05, "loss": 0.0032, "num_tokens": 15518953.0, "reward": 1.7291667461395264, "reward_std": 0.6295469999313354, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1041666716337204, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19795581698417664, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.36395498985427044, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.10371293220669031, "learning_rate": 1.9972592800659056e-05, "loss": 0.0041, "num_tokens": 15527264.0, "reward": 2.6666667461395264, "reward_std": 0.6900655627250671, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7916666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 75.25, "completions/mean_terminated_length": 75.25, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.3641394576646375, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.13309737341478467, "learning_rate": 1.9972433761222288e-05, "loss": 0.0053, "num_tokens": 15532970.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 260.375, "completions/mean_terminated_length": 260.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.3643239254750046, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.13952760491520166, "learning_rate": 1.9972274262316635e-05, "loss": 0.0056, "num_tokens": 15541893.0, "reward": 2.3839287757873535, "reward_std": 0.5548454523086548, "rewards/fixed_code_pass_all_test_reward/mean": 0.4776785969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.37793436646461487, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.90625, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2651650309562683, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.3645083932853717, "frac_reward_zero_std": 1.0, "grad_norm": 0.2890625, "kl": 0.10792180243879557, "learning_rate": 1.997211430394945e-05, "loss": 0.0043, "num_tokens": 15550722.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 226.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.3646928610957388, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.10275135794654489, "learning_rate": 1.9971953886128105e-05, "loss": 0.0041, "num_tokens": 15556295.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.36487732890610586, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.10265644080936909, "learning_rate": 1.9971793008859993e-05, "loss": 0.0041, "num_tokens": 15564072.0, "reward": 2.9000000953674316, "reward_std": 0.2828427255153656, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8999999761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2828427255153656, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.365061796716473, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.06384616647846997, "learning_rate": 1.9971631672152518e-05, "loss": 0.0026, "num_tokens": 15574571.0, "reward": 2.877840995788574, "reward_std": 0.3455181419849396, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.96875, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 478.125, "completions/mean_terminated_length": 478.125, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.36524626452684006, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.047544458881020546, "learning_rate": 1.9971469876013118e-05, "loss": 0.0019, "num_tokens": 15586324.0, "reward": 2.4312500953674316, "reward_std": 0.7185986042022705, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5562499761581421, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.48213624954223633, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.36543073233720713, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.0832887296564877, "learning_rate": 1.9971307620449244e-05, "loss": 0.0033, "num_tokens": 15590685.0, "reward": 1.7999999523162842, "reward_std": 0.38544961810112, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.800000011920929, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.38544967770576477, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 336.125, "completions/mean_terminated_length": 336.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.36561520014757426, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.04836056614294648, "learning_rate": 1.997114490546838e-05, "loss": 0.0019, "num_tokens": 15597670.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.36579966795794133, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.05576870799995959, "learning_rate": 1.997098173107802e-05, "loss": 0.0022, "num_tokens": 15601882.0, "reward": 2.6847405433654785, "reward_std": 0.5456593036651611, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8097402453422546, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.21280480921268463, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.3659841357683084, "frac_reward_zero_std": 1.0, "grad_norm": 1.25, "kl": 0.23473039991222322, "learning_rate": 1.9970818097285672e-05, "loss": 0.0094, "num_tokens": 15606359.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 362.75, "completions/mean_terminated_length": 362.75, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.36616860357867553, "frac_reward_zero_std": 0.0, "grad_norm": 0.5703125, "kl": 0.05718266265466809, "learning_rate": 1.997065400409889e-05, "loss": 0.0023, "num_tokens": 15618829.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3663530713890426, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.094646031036973, "learning_rate": 1.9970489451525225e-05, "loss": 0.0038, "num_tokens": 15626367.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.3665375391994097, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.11408851714804769, "learning_rate": 1.997032443957226e-05, "loss": 0.0046, "num_tokens": 15636487.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 499.5, "completions/mean_terminated_length": 499.5, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.3667220070097768, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.04207992274314165, "learning_rate": 1.9970158968247603e-05, "loss": 0.0017, "num_tokens": 15651179.0, "reward": 2.0026042461395264, "reward_std": 0.9438894391059875, "rewards/fixed_code_pass_all_test_reward/mean": 0.4609375, "rewards/fixed_code_pass_all_test_reward/std": 0.503267765045166, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6666666865348816, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.30860671401023865, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3669064748201439, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.060371272498741746, "learning_rate": 1.9969993037558872e-05, "loss": 0.0024, "num_tokens": 15655520.0, "reward": 2.8499999046325684, "reward_std": 0.232993021607399, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8500000238418579, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23299294710159302, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.36709094263051095, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.10946754133328795, "learning_rate": 1.996982664751371e-05, "loss": 0.0044, "num_tokens": 15659564.0, "reward": 2.594494104385376, "reward_std": 0.30773094296455383, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.7194940447807312, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.22325514256954193, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 222.875, "completions/mean_terminated_length": 222.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.3672754104408781, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.091189319267869, "learning_rate": 1.996965979811979e-05, "loss": 0.0036, "num_tokens": 15667291.0, "reward": 1.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 181.625, "completions/mean_terminated_length": 181.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.36745987825124515, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.10312006995081902, "learning_rate": 1.9969492489384802e-05, "loss": 0.0041, "num_tokens": 15672744.0, "reward": 1.7763888835906982, "reward_std": 0.5404976606369019, "rewards/fixed_code_pass_all_test_reward/mean": 0.6597222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.4696279466152191, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.24166667461395264, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.26829859614372253, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.3676443460616122, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.09347045049071312, "learning_rate": 1.9969324721316444e-05, "loss": 0.0037, "num_tokens": 15679905.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.36782881387197935, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.1225358908995986, "learning_rate": 1.996915649392245e-05, "loss": 0.0049, "num_tokens": 15688025.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 122.75, "completions/mean_terminated_length": 122.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.3680132816823464, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.09335365984588861, "learning_rate": 1.996898780721057e-05, "loss": 0.0037, "num_tokens": 15694655.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3681977494927135, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.10209297249093652, "learning_rate": 1.9968818661188583e-05, "loss": 0.0041, "num_tokens": 15701965.0, "reward": 2.28125, "reward_std": 0.6469364762306213, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.78125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.41052016615867615, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 173.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.36838221730308063, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.11355351656675339, "learning_rate": 1.9968649055864274e-05, "loss": 0.0045, "num_tokens": 15709555.0, "reward": 3.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.3685666851134477, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.11062825005501509, "learning_rate": 1.9968478991245463e-05, "loss": 0.0044, "num_tokens": 15713761.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 203.25, "completions/mean_terminated_length": 203.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.3687511529238148, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.09633964719250798, "learning_rate": 1.996830846733998e-05, "loss": 0.0039, "num_tokens": 15719611.0, "reward": 1.8333333730697632, "reward_std": 0.6725926995277405, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4583333432674408, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.38575834035873413, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 402.375, "completions/mean_terminated_length": 402.375, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.3689356207341819, "frac_reward_zero_std": 1.0, "grad_norm": 0.0322265625, "kl": 0.033554904628545046, "learning_rate": 1.996813748415569e-05, "loss": 0.0013, "num_tokens": 15728678.0, "reward": 1.7291667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7291666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, "step": 2000 } ], "logging_steps": 1, "max_steps": 16263, "num_input_tokens_seen": 15728678, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }